[GPU-Plugin] Integration of a faster version of grow_gpu plugin into mainstream (#2360)
* Integrated a faster version of the grow_gpu plugin:
  1. Removed the older files to reduce duplication
  2. Moved all of the grow_gpu files under an 'exact' folder
  3. Placed all of them inside an 'exact' namespace to avoid conflicts
  4. Fixed a bug in benchmark.py when running only the 'grow_gpu' plugin
  5. Added cub and googletest submodules to ease integration and unit-testing
  6. Updated CMakeLists.txt to build CUDA objects directly into libxgboost
* Added support for building GPU plugins through the make flow:
  1. Updated Makefile and config.mk to add the right targets
  2. Added unit tests for the GPU exact plugin code
* Updated instructions for building and testing the GPU plugin
* Fixed travis-ci errors for PR #2360:
  1. Lint errors on unit tests
  2. Removed googletest and depended on dmlc-core to provide the gtest cache
* Fixed some more travis-ci lint failures for PR #2360
* Added Rory's copyright to the files containing code from both authors
* Updated the copyright statement as per Rory's request
* Moved the static datasets into a script that generates them at runtime
* Memory usage is now printed when silent=0; reorganized the tests/ and test/
  folders; removed the googletest dependency for just building xgboost; coding
  style updates for .cuh files as well
* Fixes for compilation warnings
* Added CUDA object files as well when JVM_BINDINGS=ON
@@ -7,6 +7,11 @@
#include "../../../src/tree/param.h"
#include "device_helpers.cuh"
#include "types.cuh"
#include <string>
#include <stdexcept>
#include <cstdio>
#include "cub/cub.cuh"
#include "device_helpers.cuh"

namespace xgboost {
namespace tree {
@@ -180,5 +185,95 @@ struct GpairCallbackOp {
  }
};

/**
 * @brief Helper function to sort the pairs using cub's segmented RadixSortPairs
 * @param tmp_mem cub temporary memory info
 * @param keys keys double-buffer array
 * @param vals the values double-buffer array
 * @param nVals number of elements in the array
 * @param nSegs number of segments
 * @param offsets the segments
 */
template <typename T1, typename T2>
void segmentedSort(dh::CubMemory &tmp_mem, dh::dvec2<T1> &keys, dh::dvec2<T2> &vals,
                   int nVals, int nSegs, dh::dvec<int> &offsets, int start = 0,
                   int end = sizeof(T1) * 8) {
  size_t tmpSize;
  dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
      NULL, tmpSize, keys.buff(), vals.buff(), nVals, nSegs,
      offsets.data(), offsets.data() + 1, start, end));
  tmp_mem.LazyAllocate(tmpSize);
  dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
      tmp_mem.d_temp_storage, tmpSize, keys.buff(), vals.buff(),
      nVals, nSegs, offsets.data(), offsets.data() + 1, start, end));
}
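
// A minimal host-side reference of what segmentedSort computes, assuming the
// usual CSC-style layout (offsets[i]..offsets[i+1] bounds segment i): each
// segment's (key, value) pairs are sorted by key independently of the other
// segments. A sketch for sanity-checking results only; the device version is
// additionally a stable radix sort.
#include <algorithm>
#include <utility>
#include <vector>
inline void segmentedSortRef(std::vector<float>& keys, std::vector<int>& vals,
                             const std::vector<int>& offsets) {
  for (size_t s = 0; s + 1 < offsets.size(); ++s) {
    std::vector<std::pair<float, int> > seg;
    for (int i = offsets[s]; i < offsets[s + 1]; ++i)
      seg.push_back(std::make_pair(keys[i], vals[i]));
    std::sort(seg.begin(), seg.end());  // sorts each segment by key
    for (int i = offsets[s]; i < offsets[s + 1]; ++i) {
      keys[i] = seg[i - offsets[s]].first;
      vals[i] = seg[i - offsets[s]].second;
    }
  }
}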

/**
 * @brief Helper function to perform device-wide sum-reduction
 * @param tmp_mem cub temporary memory info
 * @param in the input array to be reduced
 * @param out the output reduced value
 * @param nVals number of elements in the input array
 */
template <typename T>
void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
                  int nVals) {
  size_t tmpSize;
  dh::safe_cuda(cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(),
                                       nVals));
  tmp_mem.LazyAllocate(tmpSize);
  dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
                                       in.data(), out.data(), nVals));
}
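
// The NULL-then-real double call above is cub's standard two-phase idiom: the
// first call, with a NULL temp pointer, only writes the required temp byte
// count; the second call with real storage does the work. A minimal standalone
// sketch of the same idiom with raw pointers (no dh:: wrappers; error checking
// omitted for brevity):
#include <cub/cub.cuh>
#include <cstdio>
int main() {
  const int n = 4;
  float h_in[n] = {1.f, 2.f, 3.f, 4.f};
  float *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);
  void* d_temp = NULL;
  size_t temp_bytes = 0;
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);  // size query
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, n);  // real run
  float h_out;
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("sum = %f\n", h_out);  // 10.000000
  cudaFree(d_temp); cudaFree(d_out); cudaFree(d_in);
}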

/**
 * @brief Fill a given constant value across all elements in the buffer
 * @param out the buffer to be filled
 * @param len number of elements in the buffer
 * @param def default value to be filled
 */
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
void fillConst(T* out, int len, T def) {
  dh::launch_n<ItemsPerThread, BlkDim>(len, [=] __device__(int i) { out[i] = def; });
}

/**
 * @brief gather elements
 * @param out1 output gathered array for the first buffer
 * @param in1 first input buffer
 * @param out2 output gathered array for the second buffer
 * @param in2 second input buffer
 * @param instId gather indices
 * @param nVals length of the buffers
 */
template <typename T1, typename T2, int BlkDim = 256, int ItemsPerThread = 4>
void gather(T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
            int nVals) {
  dh::launch_n<ItemsPerThread, BlkDim>
    (nVals, [=] __device__(int i) {
      int iid = instId[i];
      T1 v1 = in1[iid];
      T2 v2 = in2[iid];
      out1[i] = v1;
      out2[i] = v2;
    });
}

/**
 * @brief gather elements
 * @param out output gathered array
 * @param in input buffer
 * @param instId gather indices
 * @param nVals length of the buffers
 */
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
void gather(T* out, const T* in, const int* instId, int nVals) {
  dh::launch_n<ItemsPerThread, BlkDim>
    (nVals, [=] __device__(int i) {
      int iid = instId[i];
      out[i] = in[iid];
    });
}
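
// dh::launch_n above wraps a grid-stride launch over a device lambda. A
// plain-CUDA equivalent of the single-buffer gather, with no helper types
// (the kernel name and launch shape here are illustrative, not part of the
// patch):
template <typename T>
__global__ void gatherKernelRef(T* out, const T* in, const int* instId, int n) {
  // grid-stride loop: correct for any grid size
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[i] = in[instId[i]];  // pull the element selected by instId[i]
  }
}
// launch shape: gatherKernelRef<float><<<(n + 255) / 256, 256>>>(out, in, ids, n);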

} // namespace tree
} // namespace xgboost

@@ -2,8 +2,6 @@
 * Copyright 2016 Rory Mitchell
 */
#pragma once
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_vector.h>
#include <thrust/random.h>
#include <thrust/system/cuda/error.h>
@@ -14,6 +12,7 @@
#include <sstream>
#include <string>
#include <vector>
#include <cub/cub.cuh>

// Uncomment to enable
// #define DEVICE_TIMER
@@ -21,6 +20,9 @@

namespace dh {

#define HOST_DEV_INLINE __host__ __device__ __forceinline__
#define DEV_INLINE __device__ __forceinline__

/*
 * Error handling functions
 */
@@ -145,7 +147,7 @@ struct Timer {
  int64_t elapsed() const { return (ClockT::now() - start).count(); }
  void printElapsed(std::string label) {
    safe_cuda(cudaDeviceSynchronize());
    printf("%s:\t %lld\n", label.c_str(), elapsed());
    printf("%s:\t %lld\n", label.c_str(), (long long)elapsed());
    reset();
  }
};
@@ -228,10 +230,12 @@ __device__ void block_fill(IterT begin, size_t n, ValueT value) {
 */

class bulk_allocator;
template <typename T> class dvec2;

template <typename T>
class dvec {
  friend bulk_allocator;
  friend class dvec2<T>;

 private:
  T *_ptr;
@@ -241,15 +245,17 @@ class dvec {
    if (!empty()) {
      throw std::runtime_error("Tried to allocate dvec but already allocated");
    }

    _ptr = static_cast<T *>(ptr);
    _size = size;
  }

 public:
  dvec() : _ptr(NULL), _size(0) {}

  size_t size() const { return _size; }

  bool empty() const { return _ptr == NULL || _size == 0; }

  T *data() { return _ptr; }

  std::vector<T> as_vector() const {
@@ -265,11 +271,9 @@ class dvec {

  void print() {
    auto h_vector = this->as_vector();

    for (auto e : h_vector) {
      std::cout << e << " ";
    }

    std::cout << "\n";
  }

@@ -285,9 +289,7 @@ class dvec {
      throw std::runtime_error(
          "Cannot copy assign vector to dvec, sizes are different");
    }

    thrust::copy(other.begin(), other.end(), this->tbegin());

    return *this;
  }

@@ -296,36 +298,74 @@ class dvec {
      throw std::runtime_error(
          "Cannot copy assign dvec to dvec, sizes are different");
    }

    thrust::copy(other.tbegin(), other.tend(), this->tbegin());

    return *this;
  }
};

/**
 * @class dvec2 device_helpers.cuh
 * @brief wrapper for storing 2 dvec's which are needed for cub::DoubleBuffer
 */
template <typename T>
class dvec2 {
  friend bulk_allocator;

 private:
  dvec<T> _d1, _d2;
  cub::DoubleBuffer<T> _buff;

  void external_allocate(void *ptr1, void *ptr2, size_t size) {
    if (!empty()) {
      throw std::runtime_error("Tried to allocate dvec2 but already allocated");
    }
    _d1.external_allocate(ptr1, size);
    _d2.external_allocate(ptr2, size);
    _buff.d_buffers[0] = static_cast<T *>(ptr1);
    _buff.d_buffers[1] = static_cast<T *>(ptr2);
    _buff.selector = 0;
  }

 public:
  dvec2() : _d1(), _d2(), _buff() {}

  size_t size() const { return _d1.size(); }

  bool empty() const { return _d1.empty() || _d2.empty(); }

  cub::DoubleBuffer<T> &buff() { return _buff; }

  dvec<T> &d1() { return _d1; }

  dvec<T> &d2() { return _d2; }

  T *current() { return _buff.Current(); }

  dvec<T> &current_dvec() { return _buff.selector == 0 ? d1() : d2(); }

  T *other() { return _buff.Alternate(); }
};
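
// How the dvec2/cub::DoubleBuffer pairing is used: cub's radix sort ping-pongs
// between the two underlying buffers and flips `selector` to say which one
// holds the latest data, which is what current()/other() expose above. A
// host-only sketch of the selector mechanics (cub::DoubleBuffer is a plain
// struct, so this runs on the host; the demo name is illustrative):
#include <cassert>
inline void doubleBufferDemo() {
  int a[4], b[4];
  cub::DoubleBuffer<int> buf(a, b);
  // selector == 0: a is current, b is the alternate
  assert(buf.Current() == a && buf.Alternate() == b);
  buf.selector ^= 1;  // same flip sortKeys() does after gathering into .other()
  assert(buf.Current() == b && buf.Alternate() == a);
}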

class bulk_allocator {
  char *d_ptr;
  size_t _size;

  const size_t align = 256;
  const int align = 256;

  template <typename SizeT>
  size_t align_round_up(SizeT n) {
    if (n % align == 0) {
      return n;
    } else {
      return n + align - (n % align);
    }
    n = (n + align - 1) / align;
    return n * align;
  }

  template <typename T, typename SizeT>
  size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size) {
    return align_round_up(first_size * sizeof(T));
    return align_round_up<SizeT>(first_size * sizeof(T));
  }

  template <typename T, typename SizeT, typename... Args>
  size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size, Args... args) {
    return align_round_up(first_size * sizeof(T)) + get_size_bytes(args...);
    return get_size_bytes<T,SizeT>(first_vec, first_size) + get_size_bytes(args...);
  }

  template <typename T, typename SizeT>
@@ -336,11 +376,37 @@ class bulk_allocator {
  template <typename T, typename SizeT, typename... Args>
  void allocate_dvec(char *ptr, dvec<T> *first_vec, SizeT first_size,
                     Args... args) {
    first_vec->external_allocate(static_cast<void *>(ptr), first_size);
    allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
    ptr += align_round_up(first_size * sizeof(T));
    allocate_dvec(ptr, args...);
  }

  template <typename T, typename SizeT>
  size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size) {
    return 2 * align_round_up(first_size * sizeof(T));
  }

  template <typename T, typename SizeT, typename... Args>
  size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size, Args... args) {
    return get_size_bytes<T,SizeT>(first_vec, first_size) + get_size_bytes(args...);
  }

  template <typename T, typename SizeT>
  void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size) {
    first_vec->external_allocate
        (static_cast<void *>(ptr),
         static_cast<void *>(ptr + align_round_up(first_size * sizeof(T))),
         first_size);
  }

  template <typename T, typename SizeT, typename... Args>
  void allocate_dvec(char *ptr, dvec2<T> *first_vec, SizeT first_size,
                     Args... args) {
    allocate_dvec<T,SizeT>(ptr, first_vec, first_size);
    ptr += (align_round_up(first_size * sizeof(T)) * 2);
    allocate_dvec(ptr, args...);
  }

 public:
  bulk_allocator() : _size(0), d_ptr(NULL) {}

@@ -357,11 +423,8 @@ class bulk_allocator {
    if (d_ptr != NULL) {
      throw std::runtime_error("Bulk allocator already allocated");
    }

    _size = get_size_bytes(args...);

    safe_cuda(cudaMalloc(&d_ptr, _size));

    allocate_dvec(d_ptr, args...);
  }
};
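
// The alignment arithmetic above rounds a byte count up to the next multiple
// of `align` (identity when already aligned); both formulations shown in the
// diff compute the same mapping. A standalone check of that invariant
// (illustrative names, not part of the patch):
#include <cassert>
#include <cstddef>
inline size_t alignRoundUpRef(size_t n, size_t align) {
  return ((n + align - 1) / align) * align;
}
inline void alignRoundUpCheck() {
  assert(alignRoundUpRef(0, 256) == 0);
  assert(alignRoundUpRef(1, 256) == 256);    // partial block rounds up
  assert(alignRoundUpRef(256, 256) == 256);  // already aligned: unchanged
  assert(alignRoundUpRef(257, 256) == 512);
}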
@@ -374,6 +437,7 @@ struct CubMemory {
  CubMemory() : d_temp_storage(NULL), temp_storage_bytes(0) {}

  ~CubMemory() { Free(); }

  void Free() {
    if (d_temp_storage != NULL) {
      safe_cuda(cudaFree(d_temp_storage));
@@ -394,13 +458,13 @@ struct CubMemory {
inline size_t available_memory() {
  size_t device_free = 0;
  size_t device_total = 0;
  dh::safe_cuda(cudaMemGetInfo(&device_free, &device_total));
  safe_cuda(cudaMemGetInfo(&device_free, &device_total));
  return device_free;
}

inline std::string device_name() {
  cudaDeviceProp prop;
  dh::safe_cuda(cudaGetDeviceProperties(&prop, 0));
  safe_cuda(cudaGetDeviceProperties(&prop, 0));
  return std::string(prop.name);
}

@@ -430,7 +494,6 @@ template <typename T>
void print(char *label, const thrust::device_vector<T> &v,
           const char *format = "%d ", int max = 10) {
  thrust::host_vector<T> h_v = v;

  std::cout << label << ":\n";
  for (int i = 0; i < std::min(static_cast<int>(h_v.size()), max); i++) {
    printf(format, h_v[i]);
@@ -495,9 +558,21 @@ struct BernoulliRng {
    thrust::default_random_engine rng(seed);
    thrust::uniform_real_distribution<float> dist;
    rng.discard(i);

    return dist(rng) <= p;
  }
};

/**
 * @brief Helper macro to measure timing on GPU
 * @param call the GPU call
 * @param name name used to track later
 * @param stream cuda stream where to measure time
 */
#define TIMEIT(call, name) \
  do { \
    dh::Timer t1234; \
    call; \
    t1234.printElapsed(name); \
  } while (0)

} // namespace dh
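
The TIMEIT macro above wraps a call in a scoped Timer, synchronizes the device,
and prints the elapsed tick count. A host-only analogue of the same
do { ... } while (0) macro shape, using std::chrono instead of the
CUDA-synchronizing Timer (all names here are illustrative):

#include <chrono>
#include <cstdio>

#define TIMEIT_HOST(call, name)                                     \
  do {                                                              \
    auto t0 = std::chrono::steady_clock::now();                     \
    call;                                                           \
    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>( \
        std::chrono::steady_clock::now() - t0).count();             \
    std::printf("%s:\t %lld\n", name, (long long)ns);               \
  } while (0)

inline void someWork() {}  // any callable statement can be timed

inline void timeitDemo() {
  TIMEIT_HOST(someWork(), "someWork");
}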

plugin/updater_gpu/src/exact/argmax_by_key.cuh (new file, 192 lines)
@@ -0,0 +1,192 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "../../../../src/tree/param.h"
#include "../common.cuh"
#include "node.cuh"
#include "loss_functions.cuh"

namespace xgboost {
namespace tree {
namespace exact {

/**
 * @enum ArgMaxByKeyAlgo argmax_by_key.cuh
 * @brief Helps decide which algorithm to use for the multi-argmax operation
 */
enum ArgMaxByKeyAlgo {
  /** simplest, use gmem-atomics for all updates */
  ABK_GMEM = 0,
  /** use smem-atomics for updates (when the number of keys is small) */
  ABK_SMEM
};

/** max depth until which to use shared mem based atomics for argmax */
static const int MAX_ABK_LEVELS = 3;

HOST_DEV_INLINE Split maxSplit(Split a, Split b) {
  Split out;
  if (a.score < b.score) {
    out.score = b.score;
    out.index = b.index;
  } else if (a.score == b.score) {
    out.score = a.score;
    out.index = (a.index < b.index) ? a.index : b.index;
  } else {
    out.score = a.score;
    out.index = a.index;
  }
  return out;
}

DEV_INLINE void atomicArgMax(Split* address, Split val) {
  unsigned long long* intAddress = (unsigned long long*) address;
  unsigned long long old = *intAddress;
  unsigned long long assumed;
  do {
    assumed = old;
    Split res = maxSplit(val, *(Split*)&assumed);
    old = atomicCAS(intAddress, assumed, *(unsigned long long*)&res);
  } while (assumed != old);
}
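
// atomicArgMax above is the classic CAS retry loop: re-read the 8-byte word,
// recompute the max against the freshest value, and publish only if no other
// thread raced in between. A host-side analogue of the same shape with
// std::atomic (no type punning; illustrative names, not part of the patch):
#include <atomic>
inline void atomicMaxRef(std::atomic<long long>& target, long long val) {
  long long cur = target.load();
  // compare_exchange_weak reloads `cur` on failure, so the loop retries
  // against the latest value and stops once val no longer improves on it
  while (val > cur && !target.compare_exchange_weak(cur, val)) {
  }
}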

template <typename node_id_t>
DEV_INLINE void argMaxWithAtomics(int id, Split* nodeSplits,
                                  const gpu_gpair* gradScans,
                                  const gpu_gpair* gradSums, const float* vals,
                                  const int* colIds,
                                  const node_id_t* nodeAssigns,
                                  const Node<node_id_t>* nodes, int nUniqKeys,
                                  node_id_t nodeStart, int len,
                                  const TrainParam &param) {
  int nodeId = nodeAssigns[id];
  ///@todo: this is really a bad check! but will be fixed when we move
  /// to key-based reduction
  if ((id == 0) || !((nodeId == nodeAssigns[id-1]) &&
                     (colIds[id] == colIds[id-1]) &&
                     (vals[id] == vals[id-1]))) {
    if (nodeId != UNUSED_NODE) {
      int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart,
                              nUniqKeys);
      gpu_gpair colSum = gradSums[sumId];
      int uid = nodeId - nodeStart;
      Node<node_id_t> n = nodes[nodeId];
      gpu_gpair parentSum = n.gradSum;
      float parentGain = n.score;
      bool tmp;
      Split s;
      gpu_gpair missing = parentSum - colSum;
      s.score = loss_chg_missing(gradScans[id], missing, parentSum,
                                 parentGain, param, tmp);
      s.index = id;
      atomicArgMax(nodeSplits+uid, s);
    }  // end if nodeId != UNUSED_NODE
  }  // end if id == 0 ...
}

template <typename node_id_t>
__global__ void atomicArgMaxByKeyGmem(Split* nodeSplits,
                                      const gpu_gpair* gradScans,
                                      const gpu_gpair* gradSums,
                                      const float* vals, const int* colIds,
                                      const node_id_t* nodeAssigns,
                                      const Node<node_id_t>* nodes, int nUniqKeys,
                                      node_id_t nodeStart, int len,
                                      const TrainParam param) {
  int id = threadIdx.x + (blockIdx.x * blockDim.x);
  const int stride = blockDim.x * gridDim.x;
  for (; id < len; id += stride) {
    argMaxWithAtomics(id, nodeSplits, gradScans, gradSums, vals, colIds,
                      nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
  }
}

template <typename node_id_t>
__global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
                                      const gpu_gpair* gradScans,
                                      const gpu_gpair* gradSums,
                                      const float* vals, const int* colIds,
                                      const node_id_t* nodeAssigns,
                                      const Node<node_id_t>* nodes, int nUniqKeys,
                                      node_id_t nodeStart, int len,
                                      const TrainParam param) {
  extern __shared__ char sArr[];
  Split* sNodeSplits = (Split*)sArr;
  int tid = threadIdx.x;
  Split defVal;
#pragma unroll 1
  for (int i = tid; i < nUniqKeys; i += blockDim.x) {
    sNodeSplits[i] = defVal;
  }
  __syncthreads();
  int id = tid + (blockIdx.x * blockDim.x);
  const int stride = blockDim.x * gridDim.x;
  for (; id < len; id += stride) {
    argMaxWithAtomics(id, sNodeSplits, gradScans, gradSums, vals, colIds,
                      nodeAssigns, nodes, nUniqKeys, nodeStart, len, param);
  }
  __syncthreads();
  for (int i = tid; i < nUniqKeys; i += blockDim.x) {
    Split s = sNodeSplits[i];
    atomicArgMax(nodeSplits+i, s);
  }
}

/**
 * @brief Performs argmax_by_key functionality but for cases when keys need not
 *   occur contiguously
 * @param nodeSplits will contain information on best split for each node
 * @param gradScans exclusive sum on sorted segments for each col
 * @param gradSums gradient sum for each column in DMatrix based on node-ids
 * @param vals feature values
 * @param colIds column index for each element in the feature values array
 * @param nodeAssigns node-id assignments to each element in DMatrix
 * @param nodes pointer to all nodes for this tree in BFS order
 * @param nUniqKeys number of unique node-ids in this level
 * @param nodeStart start index of the node-ids in this level
 * @param len number of elements
 * @param param training parameters
 * @param algo which algorithm to use for argmax_by_key
 */
template <typename node_id_t, int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans,
                 const gpu_gpair* gradSums, const float* vals, const int* colIds,
                 const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
                 node_id_t nodeStart, int len, const TrainParam param,
                 ArgMaxByKeyAlgo algo) {
  fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(nodeSplits, nUniqKeys, Split());
  int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM);
  switch (algo) {
    case ABK_GMEM:
      atomicArgMaxByKeyGmem<node_id_t><<<nBlks,BLKDIM>>>
          (nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
           nUniqKeys, nodeStart, len, param);
      break;
    case ABK_SMEM:
      atomicArgMaxByKeySmem<node_id_t>
          <<<nBlks,BLKDIM,sizeof(Split)*nUniqKeys>>>
          (nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
           nUniqKeys, nodeStart, len, param);
      break;
    default:
      throw std::runtime_error("argMaxByKey: Bad algo passed!");
  };
}

} // namespace exact
} // namespace tree
} // namespace xgboost

plugin/updater_gpu/src/exact/fused_scan_reduce_by_key.cuh (new file, 200 lines)
@@ -0,0 +1,200 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "../common.cuh"
#include "gradients.cuh"


namespace xgboost {
namespace tree {
namespace exact {

/**
 * @struct Pair fused_scan_reduce_by_key.cuh
 * @brief Pair used for key-based scan operations on gpu_gpair
 */
struct Pair {
  int key;
  gpu_gpair value;
};

/** define a key that's not used at all in the entire boosting process */
static const int NONE_KEY = -100;

/**
 * @brief Allocate temporary buffers needed for scan operations
 * @param tmpScans gradient buffer
 * @param tmpKeys keys buffer
 * @param size number of elements that will be scanned
 */
template <int BLKDIM_L1L3 = 256>
int scanTempBufferSize(int size) {
  int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
  return nBlks;
}

struct AddByKey {
  template <typename T>
  HOST_DEV_INLINE T operator()(const T &first, const T &second) const {
    T result;
    if (first.key == second.key) {
      result.key = first.key;
      result.value = first.value + second.value;
    } else {
      result.key = second.key;
      result.value = second.value;
    }
    return result;
  }
};

template <typename node_id_t, int BLKDIM_L1L3>
__global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
                               const int* instIds, gpu_gpair* mScans,
                               int* mKeys, const node_id_t* keys, int nUniqKeys,
                               const int* colIds, node_id_t nodeStart,
                               const int size) {
  Pair rootPair = {NONE_KEY, gpu_gpair(0.f, 0.f)};
  int myKey;
  gpu_gpair myValue;
  typedef cub::BlockScan<Pair, BLKDIM_L1L3> BlockScan;
  __shared__ typename BlockScan::TempStorage temp_storage;
  Pair threadData;
  int tid = blockIdx.x*BLKDIM_L1L3 + threadIdx.x;
  if (tid < size) {
    myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
    myValue = get(tid, vals, instIds);
  } else {
    myKey = NONE_KEY;
    myValue = 0.f;
  }
  threadData.key = myKey;
  threadData.value = myValue;
  // get previous key, especially needed for the last thread in this block
  // in order to pass on the partial scan values.
  // this statement MUST appear before the checks below!
  // else, the result of this shuffle operation will be undefined
  int previousKey = __shfl_up(myKey, 1);
  // Collectively compute the block-wide exclusive prefix sum
  BlockScan(temp_storage).ExclusiveScan(threadData, threadData, rootPair,
                                        AddByKey());
  if (tid < size) {
    scans[tid] = threadData.value;
  } else {
    return;
  }
  if (threadIdx.x == BLKDIM_L1L3 - 1) {
    threadData.value = (myKey == previousKey) ?
                           threadData.value :
                           gpu_gpair(0.0f, 0.0f);
    mKeys[blockIdx.x] = myKey;
    mScans[blockIdx.x] = threadData.value + myValue;
  }
}

template <int BLKSIZE>
__global__ void cubScanByKeyL2(gpu_gpair* mScans, int* mKeys, int mLength) {
  typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
  Pair threadData;
  __shared__ typename BlockScan::TempStorage temp_storage;
  for (int i = threadIdx.x; i < mLength; i += BLKSIZE-1) {
    threadData.key = mKeys[i];
    threadData.value = mScans[i];
    BlockScan(temp_storage).InclusiveScan(threadData, threadData,
                                          AddByKey());
    mScans[i] = threadData.value;
    __syncthreads();
  }
}

template <typename node_id_t, int BLKDIM_L1L3>
__global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
                               const gpu_gpair* vals, const int* instIds,
                               const gpu_gpair* mScans, const int* mKeys,
                               const node_id_t* keys, int nUniqKeys,
                               const int* colIds, node_id_t nodeStart,
                               const int size) {
  int relId = threadIdx.x;
  int tid = (blockIdx.x * BLKDIM_L1L3) + relId;
  // to avoid the following warning from nvcc:
  //   __shared__ memory variable with non-empty constructor or destructor
  //   (potential race between threads)
  __shared__ char gradBuff[sizeof(gpu_gpair)];
  __shared__ int s_mKeys;
  gpu_gpair* s_mScans = (gpu_gpair*)gradBuff;
  if (tid >= size)
    return;
  // cache block-wide partial scan info
  if (relId == 0) {
    s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x-1] : NONE_KEY;
    s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x-1] : gpu_gpair();
  }
  int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
  int previousKey = tid == 0 ? NONE_KEY : abs2uniqKey(tid-1, keys, colIds,
                                                      nodeStart, nUniqKeys);
  gpu_gpair myValue = scans[tid];
  __syncthreads();
  if (blockIdx.x > 0 && s_mKeys == previousKey) {
    myValue += s_mScans[0];
  }
  if (tid == size - 1) {
    sums[previousKey] = myValue + get(tid, vals, instIds);
  }
  if ((previousKey != myKey) && (previousKey >= 0)) {
    sums[previousKey] = myValue;
    myValue = gpu_gpair(0.0f, 0.0f);
  }
  scans[tid] = myValue;
}

/**
 * @brief Performs fused reduce and scan by key functionality. It is assumed
 *   that the keys occur contiguously!
 * @param sums the output gradient reductions for each element performed key-wise
 * @param scans the output gradient scans for each element performed key-wise
 * @param vals the gradients evaluated for each observation.
 * @param instIds instance ids for each element
 * @param keys keys to be used to segment the reductions. They need not occur
 *   contiguously in contrast to scan_by_key. Currently, we need one key per
 *   value in the 'vals' array.
 * @param size number of elements in the 'vals' array
 * @param nUniqKeys max number of uniq keys found per column
 * @param nCols number of columns
 * @param tmpScans temporary scan buffer needed for cub-pyramid algo
 * @param tmpKeys temporary key buffer needed for cub-pyramid algo
 * @param colIds column indices for each element in the array
 * @param nodeStart index of the leftmost node in the current level
 */
template <typename node_id_t, int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
void reduceScanByKey(gpu_gpair* sums, gpu_gpair* scans, const gpu_gpair* vals,
                     const int* instIds, const node_id_t* keys, int size,
                     int nUniqKeys, int nCols, gpu_gpair* tmpScans,
                     int* tmpKeys, const int* colIds, node_id_t nodeStart) {
  int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
  cudaMemset(sums, 0, nUniqKeys*nCols*sizeof(gpu_gpair));
  cubScanByKeyL1<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
      (scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
       nodeStart, size);
  cubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
  cubScanByKeyL3<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
      (sums, scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
       nodeStart, size);
}

} // namespace exact
} // namespace tree
} // namespace xgboost
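
A plain C++ reference of what reduceScanByKey is meant to produce when the
keys are contiguous, handy for sanity-checking the three-kernel pyramid above
(scalar doubles stand in for gpu_gpair; all names here are illustrative):

#include <cstdio>
#include <vector>

void reduceScanByKeyRef(std::vector<double>& sums, std::vector<double>& scans,
                        const std::vector<double>& vals,
                        const std::vector<int>& keys) {
  double run = 0.0;
  for (size_t i = 0; i < vals.size(); ++i) {
    if (i > 0 && keys[i] != keys[i - 1]) run = 0.0;  // new key: restart scan
    scans[i] = run;            // exclusive prefix sum within the key segment
    run += vals[i];
    sums[keys[i]] += vals[i];  // per-key reduction
  }
}

int main() {
  std::vector<double> vals = {1, 2, 3, 4, 5};
  std::vector<int> keys = {0, 0, 1, 1, 1};
  std::vector<double> sums(2, 0.0), scans(5, 0.0);
  reduceScanByKeyRef(sums, scans, vals, keys);
  // expected: sums = {3, 12}, scans = {0, 1, 0, 3, 7}
  std::printf("sums: %g %g\n", sums[0], sums[1]);
}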

plugin/updater_gpu/src/exact/gpu_builder.cuh (new file, 391 lines)
@@ -0,0 +1,391 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "../../../../src/tree/param.h"
#include "xgboost/tree_updater.h"
#include "cub/cub.cuh"
#include "../common.cuh"
#include <vector>
#include "loss_functions.cuh"
#include "gradients.cuh"
#include "node.cuh"
#include "argmax_by_key.cuh"
#include "split2node.cuh"
#include "fused_scan_reduce_by_key.cuh"


namespace xgboost {
namespace tree {
namespace exact {

template <typename node_id_t>
__global__ void initRootNode(Node<node_id_t>* nodes, const gpu_gpair* sums,
                             const TrainParam param) {
  // gradients already evaluated inside transferGrads
  Node<node_id_t> n;
  n.gradSum = sums[0];
  n.score = CalcGain(param, n.gradSum.g, n.gradSum.h);
  n.weight = CalcWeight(param, n.gradSum.g, n.gradSum.h);
  n.id = 0;
  nodes[0] = n;
}

template <typename node_id_t>
__global__ void assignColIds(int* colIds, const int* colOffsets) {
  int myId = blockIdx.x;
  int start = colOffsets[myId];
  int end = colOffsets[myId+1];
  for (int id = start+threadIdx.x; id < end; id += blockDim.x) {
    colIds[id] = myId;
  }
}

template <typename node_id_t>
__global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
                                   const Node<node_id_t>* nodes, int nRows) {
  int id = threadIdx.x + (blockIdx.x * blockDim.x);
  if (id >= nRows) {
    return;
  }
  // if this element belongs to none of the currently active node-id's
  node_id_t nId = nodeIdsPerInst[id];
  if (nId == UNUSED_NODE) {
    return;
  }
  const Node<node_id_t> n = nodes[nId];
  node_id_t result;
  if (n.isLeaf() || n.isUnused()) {
    result = UNUSED_NODE;
  } else if (n.isDefaultLeft()) {
    result = (2 * n.id) + 1;
  } else {
    result = (2 * n.id) + 2;
  }
  nodeIdsPerInst[id] = result;
}

template <typename node_id_t>
__global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
                              const node_id_t* nodeIds, const int* instId,
                              const Node<node_id_t>* nodes, const int* colOffsets,
                              const float* vals, int nVals, int nCols) {
  int id = threadIdx.x + (blockIdx.x * blockDim.x);
  const int stride = blockDim.x * gridDim.x;
  for (; id < nVals; id += stride) {
    // fusing generation of indices for node locations
    nodeLocations[id] = id;
    // using nodeIds here since the previous kernel would have updated
    // the nodeIdsPerInst with all default assignments
    int nId = nodeIds[id];
    // if this element belongs to none of the currently active node-id's
    if (nId != UNUSED_NODE) {
      const Node<node_id_t> n = nodes[nId];
      int colId = n.colIdx;
      //printf("nid=%d colId=%d id=%d\n", nId, colId, id);
      int start = colOffsets[colId];
      int end = colOffsets[colId + 1];
      ///@todo: too many wasted threads!!
      if ((id >= start) && (id < end) && !(n.isLeaf() || n.isUnused())) {
        node_id_t result = (2 * n.id) + 1 + (vals[id] >= n.threshold);
        nodeIdsPerInst[instId[id]] = result;
      }
    }
  }
}

template <typename node_id_t>
__global__ void markLeavesKernel(Node<node_id_t>* nodes, int len) {
  int id = (blockIdx.x * blockDim.x) + threadIdx.x;
  if ((id < len) && !nodes[id].isUnused()) {
    int lid = (id << 1) + 1;
    int rid = (id << 1) + 2;
    if ((lid >= len) || (rid >= len)) {
      nodes[id].score = -FLT_MAX;  // bottom-most nodes
    } else if (nodes[lid].isUnused() && nodes[rid].isUnused()) {
      nodes[id].score = -FLT_MAX;  // unused child nodes
    }
  }
}

// unit test forward declaration for friend function access
template <typename node_id_t> void testSmallData();
template <typename node_id_t> void testLargeData();
template <typename node_id_t> void testAllocate();
template <typename node_id_t> void testMarkLeaves();
template <typename node_id_t> void testDense2Sparse();
template <typename node_id_t> class GPUBuilder;
template <typename node_id_t>
std::shared_ptr<xgboost::DMatrix> setupGPUBuilder(
    const std::string& file,
    xgboost::tree::exact::GPUBuilder<node_id_t>& builder);

template <typename node_id_t>
class GPUBuilder {
 public:
  GPUBuilder(): allocated(false) {}

  ~GPUBuilder() {}

  void Init(const TrainParam& p) {
    param = p;
    maxNodes = (1 << (param.max_depth + 1)) - 1;
    maxLeaves = 1 << param.max_depth;
  }

  void UpdateParam(const TrainParam &param) { this->param = param; }

  /// @note: Update should be called only after Init!
  void Update(const std::vector<bst_gpair>& gpair, DMatrix *hMat,
              RegTree* hTree) {
    if (!allocated) {
      setupOneTimeData(*hMat);
    }
    for (int i = 0; i < param.max_depth; ++i) {
      if (i == 0) {
        // make sure to start on a fresh tree with sorted values!
        vals.current_dvec() = vals_cached;
        instIds.current_dvec() = instIds_cached;
        transferGrads(gpair);
      }
      int nNodes = 1 << i;
      node_id_t nodeStart = nNodes - 1;
      initNodeData(i, nodeStart, nNodes);
      findSplit(i, nodeStart, nNodes);
    }
    // mark all the used nodes with unused children as leaf nodes
    markLeaves();
    dense2sparse(*hTree);
  }

 private:
  friend void testSmallData<node_id_t>();
  friend void testLargeData<node_id_t>();
  friend void testAllocate<node_id_t>();
  friend void testMarkLeaves<node_id_t>();
  friend void testDense2Sparse<node_id_t>();
  friend std::shared_ptr<xgboost::DMatrix> setupGPUBuilder<node_id_t>(
      const std::string& file, GPUBuilder<node_id_t>& builder);

  TrainParam param;
  /** whether we have initialized memory already (so as not to repeat!) */
  bool allocated;
  /** feature values stored in column-major compressed format */
  dh::dvec2<float> vals;
  dh::dvec<float> vals_cached;
  /** corresponding instance id's of these feature values */
  dh::dvec2<int> instIds;
  dh::dvec<int> instIds_cached;
  /** column offsets for these feature values */
  dh::dvec<int> colOffsets;
  dh::dvec<gpu_gpair> gradsInst;
  dh::dvec2<node_id_t> nodeAssigns;
  dh::dvec2<int> nodeLocations;
  dh::dvec<Node<node_id_t> > nodes;
  dh::dvec<node_id_t> nodeAssignsPerInst;
  dh::dvec<gpu_gpair> gradSums;
  dh::dvec<gpu_gpair> gradScans;
  dh::dvec<Split> nodeSplits;
  int nVals;
  int nRows;
  int nCols;
  int maxNodes;
  int maxLeaves;
  dh::CubMemory tmp_mem;
  dh::dvec<gpu_gpair> tmpScanGradBuff;
  dh::dvec<int> tmpScanKeyBuff;
  dh::dvec<int> colIds;
  dh::bulk_allocator ba;

  void findSplit(int level, node_id_t nodeStart, int nNodes) {
    reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(),
                    instIds.current(), nodeAssigns.current(), nVals, nNodes,
                    nCols, tmpScanGradBuff.data(), tmpScanKeyBuff.data(),
                    colIds.data(), nodeStart);
    argMaxByKey(nodeSplits.data(), gradScans.data(), gradSums.data(),
                vals.current(), colIds.data(), nodeAssigns.current(),
                nodes.data(), nNodes, nodeStart, nVals, param,
                level <= MAX_ABK_LEVELS ? ABK_SMEM : ABK_GMEM);
    split2node(nodes.data(), nodeSplits.data(), gradScans.data(),
               gradSums.data(), vals.current(), colIds.data(), colOffsets.data(),
               nodeAssigns.current(), nNodes, nodeStart, nCols, param);
  }

  void allocateAllData(int offsetSize) {
    int tmpBuffSize = scanTempBufferSize(nVals);
    ba.allocate(&vals, nVals,
                &vals_cached, nVals,
                &instIds, nVals,
                &instIds_cached, nVals,
                &colOffsets, offsetSize,
                &gradsInst, nRows,
                &nodeAssigns, nVals,
                &nodeLocations, nVals,
                &nodes, maxNodes,
                &nodeAssignsPerInst, nRows,
                &gradSums, maxLeaves*nCols,
                &gradScans, nVals,
                &nodeSplits, maxLeaves,
                &tmpScanGradBuff, tmpBuffSize,
                &tmpScanKeyBuff, tmpBuffSize,
                &colIds, nVals);
  }

  void setupOneTimeData(DMatrix& hMat) {
    size_t free_memory = dh::available_memory();
    if (!hMat.SingleColBlock()) {
      throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
    }
    std::vector<float> fval;
    std::vector<int> fId, offset;
    convertToCsc(hMat, fval, fId, offset);
    allocateAllData((int)offset.size());
    transferAndSortData(fval, fId, offset);
    allocated = true;
    if (!param.silent) {
      const int mb_size = 1048576;
      LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
                   << free_memory / mb_size << " MB on " << dh::device_name();
    }
  }

  void convertToCsc(DMatrix& hMat, std::vector<float>& fval,
                    std::vector<int>& fId, std::vector<int>& offset) {
    MetaInfo info = hMat.info();
    nRows = info.num_row;
    nCols = info.num_col;
    offset.reserve(nCols + 1);
    offset.push_back(0);
    fval.reserve(nCols * nRows);
    fId.reserve(nCols * nRows);
    // in case you end up with a DMatrix having no column access
    // then make sure to enable that before copying the data!
    if (!hMat.HaveColAccess()) {
      const std::vector<bool> enable(nCols, true);
      hMat.InitColAccess(enable, 1, nRows);
    }
    dmlc::DataIter<ColBatch>* iter = hMat.ColIterator();
    iter->BeforeFirst();
    while (iter->Next()) {
      const ColBatch& batch = iter->Value();
      for (int i = 0; i < batch.size; i++) {
        const ColBatch::Inst& col = batch[i];
        for (const ColBatch::Entry* it = col.data; it != col.data+col.length; it++) {
          int inst_id = static_cast<int>(it->index);
          fval.push_back(it->fvalue);
          fId.push_back(inst_id);
        }
        offset.push_back(fval.size());
      }
    }
    nVals = fval.size();
  }

  void transferAndSortData(const std::vector<float>& fval,
                           const std::vector<int>& fId,
                           const std::vector<int>& offset) {
    vals.current_dvec() = fval;
    instIds.current_dvec() = fId;
    colOffsets = offset;
    segmentedSort<float,int>(tmp_mem, vals, instIds, nVals, nCols, colOffsets);
    vals_cached = vals.current_dvec();
    instIds_cached = instIds.current_dvec();
    assignColIds<node_id_t><<<nCols,512>>>(colIds.data(), colOffsets.data());
  }

  void transferGrads(const std::vector<bst_gpair>& gpair) {
    // HACK
    dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
                             sizeof(gpu_gpair)*nRows, cudaMemcpyHostToDevice));
    // evaluate the full-grad reduction for the root node
    sumReduction<gpu_gpair>(tmp_mem, gradsInst, gradSums, nRows);
  }

  void initNodeData(int level, node_id_t nodeStart, int nNodes) {
    // all instances belong to root node at the beginning!
    if (level == 0) {
      nodes.fill(Node<node_id_t>());
      nodeAssigns.current_dvec().fill(0);
      nodeAssignsPerInst.fill(0);
      // for root node, just update the gradient/score/weight/id info
      // before splitting it! Currently all data is on GPU, hence this
      // stupid little kernel
      initRootNode<<<1,1>>>(nodes.data(), gradSums.data(), param);
    } else {
      const int BlkDim = 256;
      const int ItemsPerThread = 4;
      // assign default node ids first
      int nBlks = dh::div_round_up(nRows, BlkDim);
      fillDefaultNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
                                           nodes.data(), nRows);
      // evaluate the correct child indices of non-missing values next
      nBlks = dh::div_round_up(nVals, BlkDim*ItemsPerThread);
      assignNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
                                      nodeLocations.current(),
                                      nodeAssigns.current(),
                                      instIds.current(), nodes.data(),
                                      colOffsets.data(), vals.current(),
                                      nVals, nCols);
      // gather the node assignments across all other columns too
      gather<node_id_t>(nodeAssigns.current(), nodeAssignsPerInst.data(),
                        instIds.current(), nVals);
      sortKeys(level);
    }
  }

  void sortKeys(int level) {
    // segmented-sort the arrays based on node-id's
    // but we don't need more than level+1 bits for sorting!
    segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
                  0, level+1);
    gather<float,int>(vals.other(), vals.current(), instIds.other(),
                      instIds.current(), nodeLocations.current(), nVals);
    vals.buff().selector ^= 1;
    instIds.buff().selector ^= 1;
  }

  void markLeaves() {
    const int BlkDim = 128;
    int nBlks = dh::div_round_up(maxNodes, BlkDim);
    markLeavesKernel<<<nBlks,BlkDim>>>(nodes.data(), maxNodes);
  }

  void dense2sparse(RegTree &tree) {
    std::vector<Node<node_id_t> > hNodes = nodes.as_vector();
    int nodeId = 0;
    for (int i = 0; i < maxNodes; ++i) {
      const Node<node_id_t>& n = hNodes[i];
      if ((i != 0) && hNodes[i].isLeaf()) {
        tree[nodeId].set_leaf(n.weight * param.learning_rate);
        tree.stat(nodeId).sum_hess = n.gradSum.h;
        ++nodeId;
      } else if (!hNodes[i].isUnused()) {
        tree.AddChilds(nodeId);
        tree[nodeId].set_split(n.colIdx, n.threshold, n.dir==LeftDir);
        tree.stat(nodeId).loss_chg = n.score;
        tree.stat(nodeId).sum_hess = n.gradSum.h;
        tree.stat(nodeId).base_weight = n.weight;
        tree[tree[nodeId].cleft()].set_leaf(0);
        tree[tree[nodeId].cright()].set_leaf(0);
        ++nodeId;
      }
    }
  }
};

} // namespace exact
} // namespace tree
} // namespace xgboost
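
The builder addresses nodes as a dense BFS array: level i starts at index
2^i - 1 and holds 2^i nodes, and the children of node n sit at 2n+1 and 2n+2.
That is the nNodes/nodeStart arithmetic in Update() and the shifts in
markLeavesKernel. A standalone sketch of the indexing:

#include <cstdio>

int main() {
  const int max_depth = 3;
  for (int level = 0; level <= max_depth; ++level) {
    int nNodes = 1 << level;     // node count at this level
    int nodeStart = nNodes - 1;  // first BFS index of the level
    std::printf("level %d: ids [%d, %d)\n", level, nodeStart,
                nodeStart + nNodes);
  }
  int id = 4;  // children, as computed in markLeavesKernel
  std::printf("children of %d: %d and %d\n", id, (id << 1) + 1, (id << 1) + 2);
}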

plugin/updater_gpu/src/exact/gradients.cuh (new file, 91 lines)
@@ -0,0 +1,91 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "../common.cuh"


namespace xgboost {
namespace tree {
namespace exact {

/**
 * @struct gpu_gpair gradients.cuh
 * @brief The first/second order gradients for iteratively building the tree
 */
struct gpu_gpair {
  /** the 'g_i' as it appears in the xgboost paper */
  float g;
  /** the 'h_i' as it appears in the xgboost paper */
  float h;

  HOST_DEV_INLINE gpu_gpair(): g(0.f), h(0.f) {}
  HOST_DEV_INLINE gpu_gpair(const float& _g, const float& _h): g(_g), h(_h) {}
  HOST_DEV_INLINE gpu_gpair(const gpu_gpair& a): g(a.g), h(a.h) {}

  /**
   * @brief Checks whether the hessian is more than the defined weight
   * @param minWeight minimum weight to be compared against
   * @return true if the hessian is greater than the minWeight
   * @note this is useful in deciding whether to further split to a child node
   */
  HOST_DEV_INLINE bool isSplittable(float minWeight) const {
    return (h > minWeight);
  }

  HOST_DEV_INLINE gpu_gpair& operator+=(const gpu_gpair& a) {
    g += a.g;
    h += a.h;
    return *this;
  }

  HOST_DEV_INLINE gpu_gpair& operator-=(const gpu_gpair& a) {
    g -= a.g;
    h -= a.h;
    return *this;
  }

  HOST_DEV_INLINE friend gpu_gpair operator+(const gpu_gpair& a,
                                             const gpu_gpair& b) {
    return gpu_gpair(a.g+b.g, a.h+b.h);
  }

  HOST_DEV_INLINE friend gpu_gpair operator-(const gpu_gpair& a,
                                             const gpu_gpair& b) {
    return gpu_gpair(a.g-b.g, a.h-b.h);
  }

  HOST_DEV_INLINE gpu_gpair(int value) {
    *this = gpu_gpair((float)value, (float)value);
  }
};


/**
 * @brief Gradient value getter function
 * @param id the index into the instIds array from which to fetch
 * @param vals the gradient value buffer
 * @param instIds instance index buffer
 * @return the expected gradient value
 */
HOST_DEV_INLINE gpu_gpair get(int id, const gpu_gpair* vals, const int* instIds) {
  id = instIds[id];
  return vals[id];
}

} // namespace exact
} // namespace tree
} // namespace xgboost

plugin/updater_gpu/src/exact/loss_functions.cuh (new file, 63 lines)
@@ -0,0 +1,63 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "../common.cuh"
#include "gradients.cuh"


namespace xgboost {
namespace tree {
namespace exact {

HOST_DEV_INLINE float device_calc_loss_chg(const TrainParam &param,
                                           const gpu_gpair &scan,
                                           const gpu_gpair &missing,
                                           const gpu_gpair &parent_sum,
                                           const float &parent_gain,
                                           bool missing_left) {
  gpu_gpair left = scan;
  if (missing_left) {
    left += missing;
  }
  gpu_gpair right = parent_sum - left;
  float left_gain = CalcGain(param, left.g, left.h);
  float right_gain = CalcGain(param, right.g, right.h);
  return left_gain + right_gain - parent_gain;
}

HOST_DEV_INLINE float loss_chg_missing(const gpu_gpair &scan,
                                       const gpu_gpair &missing,
                                       const gpu_gpair &parent_sum,
                                       const float &parent_gain,
                                       const TrainParam &param,
                                       bool &missing_left_out) {
  float missing_left_loss =
      device_calc_loss_chg(param, scan, missing, parent_sum, parent_gain, true);
  float missing_right_loss = device_calc_loss_chg(
      param, scan, missing, parent_sum, parent_gain, false);
  if (missing_left_loss >= missing_right_loss) {
    missing_left_out = true;
    return missing_left_loss;
  } else {
    missing_left_out = false;
    return missing_right_loss;
  }
}

} // namespace exact
} // namespace tree
} // namespace xgboost
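
loss_chg_missing tries the missing-value gradient mass on each side of the
split and keeps the better result, reporting the chosen direction through
missing_left_out. A standalone sketch of that decision with a stubbed-out
gain (the real CalcGain lives in src/tree/param.h; the g*g/h stub and all
names here are illustrative):

#include <cstdio>

struct GPairRef { double g, h; };

double gainRef(GPairRef s) { return s.h > 0 ? (s.g * s.g) / s.h : 0.0; }

double lossChgMissingRef(GPairRef scan, GPairRef missing, GPairRef parent,
                         double parentGain, bool& missingLeft) {
  GPairRef left = {scan.g + missing.g, scan.h + missing.h};  // missing left
  GPairRef right = {parent.g - left.g, parent.h - left.h};
  double lossL = gainRef(left) + gainRef(right) - parentGain;
  GPairRef rightR = {parent.g - scan.g, parent.h - scan.h};  // missing right
  double lossR = gainRef(scan) + gainRef(rightR) - parentGain;
  missingLeft = (lossL >= lossR);  // ties default to the left, as above
  return missingLeft ? lossL : lossR;
}

int main() {
  bool left;
  GPairRef scan = {2, 1}, missing = {1, 2}, parent = {4, 4};
  double chg = lossChgMissingRef(scan, missing, parent, gainRef(parent), left);
  std::printf("loss_chg=%g, missing goes %s\n", chg, left ? "left" : "right");
  // expected: loss_chg~1.33, missing goes right
}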

plugin/updater_gpu/src/exact/node.cuh (new file, 158 lines)
@@ -0,0 +1,158 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "gradients.cuh"
#include "../common.cuh"


namespace xgboost {
namespace tree {
namespace exact {

/**
 * @enum DefaultDirection node.cuh
 * @brief Default direction to be followed in case of missing values
 */
enum DefaultDirection {
  /** move to left child */
  LeftDir = 0,
  /** move to right child */
  RightDir
};


/** used to assign default id to a Node */
static const int UNUSED_NODE = -1;


/**
 * @struct Split node.cuh
 * @brief Abstraction of a possible split in the decision tree
 */
struct Split {
  /** the optimal gain score for this node */
  float score;
  /** index where to split in the DMatrix */
  int index;

  HOST_DEV_INLINE Split(): score(-FLT_MAX), index(INT_MAX) {}

  /**
   * @brief Whether the split info is valid to be used to create a new child
   * @param minSplitLoss minimum score above which decision to split is made
   * @return true if splittable, else false
   */
  HOST_DEV_INLINE bool isSplittable(float minSplitLoss) const {
    return ((score >= minSplitLoss) && (index != INT_MAX));
  }
};


/**
 * @struct Node node.cuh
 * @brief Abstraction of a node in the decision tree
 */
template <typename node_id_t>
class Node {
 public:
  /** sum of gradients across all training samples part of this node */
  gpu_gpair gradSum;
  /** the optimal score for this node */
  float score;
  /** weight for this node */
  float weight;
  /** default direction for missing values */
  DefaultDirection dir;
  /** threshold value for comparison */
  float threshold;
  /** column (feature) index whose value needs to be compared in this node */
  int colIdx;
  /** node id (used as key for reduce/scan) */
  node_id_t id;

  HOST_DEV_INLINE Node(): gradSum(), score(-FLT_MAX), weight(-FLT_MAX),
                          dir(LeftDir), threshold(0.f), colIdx(UNUSED_NODE),
                          id(UNUSED_NODE) {}

  /** Tells whether this node is part of the decision tree */
  HOST_DEV_INLINE bool isUnused() const { return (id == UNUSED_NODE); }

  /** Tells whether this node is a leaf of the decision tree */
  HOST_DEV_INLINE bool isLeaf() const {
    return (!isUnused() && (score == -FLT_MAX));
  }

  /** Tells whether the default direction is the left child or not */
  HOST_DEV_INLINE bool isDefaultLeft() const { return (dir == LeftDir); }
};


/**
 * @struct Segment node.cuh
 * @brief Space inefficient, but super easy to implement structure to define
 *   the start and end of a segment in the input array
 */
struct Segment {
  /** start index of the segment */
  int start;
  /** end index of the segment */
  int end;

  HOST_DEV_INLINE Segment(): start(-1), end(-1) {}

  /** Checks whether the current structure defines a valid segment */
  HOST_DEV_INLINE bool isValid() const {
    return !((start == -1) || (end == -1));
  }
};


/**
 * @enum NodeType node.cuh
 * @brief Useful to describe the node type in a dense BFS-order tree array
 */
enum NodeType {
  /** a non-leaf node */
  NODE = 0,
  /** leaf node */
  LEAF,
  /** unused node */
  UNUSED
};


/**
 * @brief Absolute BFS order IDs to col-wise unique IDs based on user input
 * @param tid the index of the element that this thread should access
 * @param abs the array of absolute IDs
 * @param colIds the array of column IDs for each element
 * @param nodeStart the start of the node ID at this level
 * @param nKeys number of nodes at this level.
 * @return the uniq key
 */
template <typename node_id_t>
HOST_DEV_INLINE int abs2uniqKey(int tid, const node_id_t* abs,
                                const int* colIds, node_id_t nodeStart,
                                int nKeys) {
  int a = abs[tid];
  if (a == UNUSED_NODE) return a;
  return ((a - nodeStart) + (colIds[tid] * nKeys));
}

} // namespace exact
} // namespace tree
} // namespace xgboost
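
abs2uniqKey flattens a (node, column) pair into a single reduction key: node
ids are rebased by nodeStart, then laid out in column-major blocks of nKeys
slots each. A standalone check of the mapping (illustrative name):

#include <cassert>

int abs2uniqKeyRef(int absId, int colId, int nodeStart, int nKeys) {
  if (absId == -1) return absId;               // UNUSED_NODE passes through
  return (absId - nodeStart) + colId * nKeys;  // column-major (node, col)
}

int main() {
  // a level with 4 nodes whose BFS ids start at 3: nodeStart=3, nKeys=4
  assert(abs2uniqKeyRef(3, 0, 3, 4) == 0);    // first node, col 0
  assert(abs2uniqKeyRef(6, 0, 3, 4) == 3);    // last node, col 0
  assert(abs2uniqKeyRef(3, 2, 3, 4) == 8);    // first node, col 2
  assert(abs2uniqKeyRef(-1, 2, 3, 4) == -1);  // unused stays unused
  return 0;
}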

plugin/updater_gpu/src/exact/split2node.cuh (new file, 150 lines)
@@ -0,0 +1,150 @@
/*
 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include "../../../../src/tree/param.h"
#include "gradients.cuh"
#include "node.cuh"
#include "loss_functions.cuh"


namespace xgboost {
namespace tree {
namespace exact {

/**
 * @brief Helper function to update the child node based on the current status
 *   of its parent node
 * @param nodes the nodes array in which the position at 'nid' will be updated
 * @param nid the nodeId in the 'nodes' array corresponding to this child node
 * @param grad gradient sum for this child node
 * @param param the training parameter struct
 */
template <typename node_id_t>
DEV_INLINE void updateOneChildNode(Node<node_id_t>* nodes, int nid,
                                   const gpu_gpair& grad,
                                   const TrainParam &param) {
  nodes[nid].gradSum = grad;
  nodes[nid].score = CalcGain(param, grad.g, grad.h);
  nodes[nid].weight = CalcWeight(param, grad.g, grad.h);
  nodes[nid].id = nid;
}

/**
 * @brief Helper function to update the child nodes based on the current status
 *   of their parent node
 * @param nodes the nodes array in which the two children of 'pid' will be updated
 * @param pid the nodeId of the parent
 * @param gradL gradient sum for the left child node
 * @param gradR gradient sum for the right child node
 * @param param the training parameter struct
 */
template <typename node_id_t>
DEV_INLINE void updateChildNodes(Node<node_id_t>* nodes, int pid,
                                 const gpu_gpair& gradL, const gpu_gpair& gradR,
                                 const TrainParam &param) {
  int childId = (pid * 2) + 1;
  updateOneChildNode(nodes, childId, gradL, param);
  updateOneChildNode(nodes, childId+1, gradR, param);
}

template <typename node_id_t>
DEV_INLINE void updateNodeAndChildren(Node<node_id_t>* nodes, const Split& s,
                                      const Node<node_id_t>& n, int absNodeId, int colId,
                                      const gpu_gpair& gradScan,
                                      const gpu_gpair& colSum, float thresh,
                                      const TrainParam &param) {
  bool missingLeft = true;
  // get the default direction for the current node
  gpu_gpair missing = n.gradSum - colSum;
  loss_chg_missing(gradScan, missing, n.gradSum, n.score, param, missingLeft);
  // get the score/weight/id/gradSum for left and right child nodes
  gpu_gpair lGradSum, rGradSum;
  if (missingLeft) {
    lGradSum = gradScan + n.gradSum - colSum;
  } else {
    lGradSum = gradScan;
  }
  rGradSum = n.gradSum - lGradSum;
  updateChildNodes(nodes, absNodeId, lGradSum, rGradSum, param);
  // update default-dir, threshold and feature id for current node
  nodes[absNodeId].dir = missingLeft ? LeftDir : RightDir;
  nodes[absNodeId].colIdx = colId;
  nodes[absNodeId].threshold = thresh;
}

template <typename node_id_t, int BLKDIM = 256>
__global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits,
                                 const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums, const float* vals,
|
||||
const int* colIds, const int* colOffsets,
|
||||
const node_id_t* nodeAssigns, int nUniqKeys,
|
||||
node_id_t nodeStart, int nCols,
|
||||
const TrainParam param) {
|
||||
int uid = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
if (uid >= nUniqKeys) {
|
||||
return;
|
||||
}
|
||||
int absNodeId = uid + nodeStart;
|
||||
Split s = nodeSplits[uid];
|
||||
if (s.isSplittable(param.min_split_loss)) {
|
||||
int idx = s.index;
|
||||
int nodeInstId = abs2uniqKey(idx, nodeAssigns, colIds, nodeStart,
|
||||
nUniqKeys);
|
||||
updateNodeAndChildren(nodes, s, nodes[absNodeId], absNodeId,
|
||||
colIds[idx], gradScans[idx],
|
||||
gradSums[nodeInstId], vals[idx], param);
|
||||
} else {
|
||||
// cannot be split further, so this node is a leaf!
|
||||
nodes[absNodeId].score = -FLT_MAX;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief function to convert split information into node
|
||||
* @param nodes the output nodes
|
||||
* @param nodeSplits split information
|
||||
* @param gradScans scan of sorted gradients across columns
|
||||
* @param gradSums key-wise gradient reduction across columns
|
||||
* @param vals the feature values
|
||||
* @param colIds column indices for each element in the array
|
||||
* @param colOffsets column segment offsets
|
||||
* @param nodeAssigns node-id assignment to every feature value
|
||||
* @param nUniqKeys number of nodes that we are currently working on
|
||||
* @param nodeStart start offset of the nodes in the overall BFS tree
|
||||
* @param nCols number of columns
|
||||
* @param preUniquifiedKeys whether to uniquify the keys from inside kernel or not
|
||||
* @param param the training parameter struct
|
||||
*/
|
||||
template <typename node_id_t, int BLKDIM=256>
|
||||
void split2node(Node<node_id_t>* nodes, const Split* nodeSplits, const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums, const float* vals, const int* colIds,
|
||||
const int* colOffsets, const node_id_t* nodeAssigns,
|
||||
int nUniqKeys, node_id_t nodeStart, int nCols,
|
||||
const TrainParam param) {
|
||||
int nBlks = dh::div_round_up(nUniqKeys, BLKDIM);
|
||||
split2nodeKernel<<<nBlks,BLKDIM>>>(nodes, nodeSplits, gradScans, gradSums,
|
||||
vals, colIds, colOffsets, nodeAssigns,
|
||||
nUniqKeys, nodeStart, nCols,
|
||||
param);
|
||||
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
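The child-index arithmetic above relies on the dense BFS layout used throughout the plugin: level l occupies array indices [(1 << l) - 1, (1 << (l + 1)) - 1), and the children of node i sit at 2*i + 1 and 2*i + 2. A small standalone check of these invariants (illustrative only):

#include <cstdio>

int main() {
  for (int level = 0; level < 4; ++level) {
    int begin = (1 << level) - 1;       // first node index on this level
    int end = (1 << (level + 1)) - 1;   // one past the last node index
    printf("level %d: nodes [%d, %d)\n", level, begin, end);
  }
  const int pid = 2;  // any internal node
  printf("children of node %d: %d and %d\n", pid, 2 * pid + 1, 2 * pid + 2);
  return 0;
}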
@@ -1,110 +0,0 @@
/*!
 * Copyright 2016 Rory Mitchell
 */
#pragma once
#include <cub/cub.cuh>
#include <xgboost/base.h>
#include <vector>
#include "device_helpers.cuh"
#include "find_split_multiscan.cuh"
#include "find_split_sorting.cuh"
#include "gpu_data.cuh"
#include "types.cuh"

namespace xgboost {
namespace tree {

__global__ void
reduce_split_candidates_kernel(Split *d_split_candidates, Node *d_current_nodes,
                               Node *d_new_nodes, int n_current_nodes,
                               int n_features, const GPUTrainingParam param) {
  int nid = blockIdx.x * blockDim.x + threadIdx.x;

  if (nid >= n_current_nodes) {
    return;
  }

  // Find best split for each node
  Split best;

  for (int i = 0; i < n_features; i++) {
    best.Update(d_split_candidates[n_current_nodes * i + nid]);
  }

  // Update current node
  d_current_nodes[nid].split = best;

  // Generate new nodes
  d_new_nodes[nid * 2] =
      Node(best.left_sum,
           CalcGain(param, best.left_sum.grad(), best.left_sum.hess()),
           CalcWeight(param, best.left_sum.grad(), best.left_sum.hess()));
  d_new_nodes[nid * 2 + 1] =
      Node(best.right_sum,
           CalcGain(param, best.right_sum.grad(), best.right_sum.hess()),
           CalcWeight(param, best.right_sum.grad(), best.right_sum.hess()));
}

void reduce_split_candidates(Split *d_split_candidates, Node *d_nodes,
                             int level, int n_features,
                             const GPUTrainingParam param) {
  // Current level nodes (before split)
  Node *d_current_nodes = d_nodes + (1 << level) - 1;
  // Next level nodes (after split)
  Node *d_new_nodes = d_nodes + (1 << (level + 1)) - 1;
  // Number of existing nodes on this level
  int n_current_nodes = 1 << level;

  const int BLOCK_THREADS = 512;
  const int GRID_SIZE = dh::div_round_up(n_current_nodes, BLOCK_THREADS);

  reduce_split_candidates_kernel<<<GRID_SIZE, BLOCK_THREADS>>>(
      d_split_candidates, d_current_nodes, d_new_nodes, n_current_nodes,
      n_features, param);
  dh::safe_cuda(cudaDeviceSynchronize());
}

void colsample_level(GPUData *data, const TrainParam xgboost_param,
                     const std::vector<int> &feature_set_tree,
                     std::vector<int> *feature_set_level) {
  unsigned n_bytree =
      static_cast<unsigned>(xgboost_param.colsample_bytree * data->n_features);
  unsigned n =
      static_cast<unsigned>(n_bytree * xgboost_param.colsample_bylevel);
  CHECK_GT(n, 0);

  *feature_set_level = feature_set_tree;

  std::shuffle((*feature_set_level).begin(),
               (*feature_set_level).begin() + n_bytree, common::GlobalRandom());

  data->feature_set = *feature_set_level;

  data->feature_flags.fill(0);
  auto d_feature_set = data->feature_set.data();
  auto d_feature_flags = data->feature_flags.data();

  dh::launch_n(
      n, [=] __device__(int i) { d_feature_flags[d_feature_set[i]] = 1; });
}

void find_split(GPUData *data, const TrainParam xgboost_param, const int level,
                bool multiscan_algorithm,
                const std::vector<int> &feature_set_tree,
                std::vector<int> *feature_set_level) {
  colsample_level(data, xgboost_param, feature_set_tree, feature_set_level);
  // Reset split candidates
  data->split_candidates.fill(Split());

  if (multiscan_algorithm) {
    find_split_candidates_multiscan(data, level);
  } else {
    find_split_candidates_sorted(data, level);
  }

  // Find the best split for each node
  reduce_split_candidates(data->split_candidates.data(), data->nodes.data(),
                          level, data->n_features, data->param);
}
}  // namespace tree
}  // namespace xgboost
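To see what colsample_level computes, here is a minimal host sketch of the same arithmetic (the parameter values are made up): with 10 features, colsample_bytree = 0.8 keeps 8 candidates per tree, and colsample_bylevel = 0.5 then flags 4 of those as active for the current level.

#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

int main() {
  const int n_features = 10;
  const float colsample_bytree = 0.8f, colsample_bylevel = 0.5f;
  unsigned n_bytree = static_cast<unsigned>(colsample_bytree * n_features);  // 8
  unsigned n = static_cast<unsigned>(n_bytree * colsample_bylevel);          // 4

  std::vector<int> features(n_features);
  std::iota(features.begin(), features.end(), 0);
  std::mt19937 rng(0);  // the plugin uses common::GlobalRandom() here
  std::shuffle(features.begin(), features.begin() + n_bytree, rng);

  // Only the first n shuffled features are flagged active for this level.
  std::vector<int> flags(n_features, 0);
  for (unsigned i = 0; i < n; ++i) flags[features[i]] = 1;
  return 0;
}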
@@ -1,723 +0,0 @@
/*!
 * Copyright 2016 Rory Mitchell
 */
#pragma once
#include <cub/cub.cuh>
#include <xgboost/base.h>
#include "device_helpers.cuh"
#include "gpu_data.cuh"
#include "types.cuh"
#include "common.cuh"

namespace xgboost {
namespace tree {

typedef uint64_t BitFlagSet;

__device__ __inline__ void set_bit(BitFlagSet &bf, int index) {  // NOLINT
  bf |= 1ULL << index;
}

__device__ __inline__ bool check_bit(BitFlagSet bf, int index) {
  return (bf >> index) & 1;
}

// Carryover prefix for scanning multiple tiles of bit flags
struct FlagPrefixCallbackOp {
  BitFlagSet tile_carry;

  __device__ FlagPrefixCallbackOp() : tile_carry(0) {}

  __device__ BitFlagSet operator()(BitFlagSet block_aggregate) {
    BitFlagSet old_prefix = tile_carry;
    tile_carry |= block_aggregate;
    return old_prefix;
  }
};

// Scan op for bit flags that resets if the final bit is set
struct FlagScanOp {
  __device__ __forceinline__ BitFlagSet operator()(const BitFlagSet &a,
                                                   const BitFlagSet &b) {
    if (check_bit(b, 63)) {
      return b;
    } else {
      return a | b;
    }
  }
};
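// Illustration (not part of the original file): how FlagScanOp segments the
// scan. Bit 63 marks a leftmost feature value, so when it is set on the
// right-hand operand the accumulated node-id flags restart instead of being
// OR-ed together. A host-side mirror of the combine step:
inline BitFlagSet flag_scan_demo(BitFlagSet a, BitFlagSet b) {
  // e.g. a = 0b0101, b = 0b0010            -> 0b0111 (same segment)
  //      a = 0b0101, b = (1ULL<<63)|0b0010 -> b      (segment restarts)
  return ((b >> 63) & 1) ? b : (a | b);
}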
template <int _BLOCK_THREADS, int _N_NODES, bool _DEBUG_VALIDATE>
struct FindSplitParamsMultiscan {
  enum {
    BLOCK_THREADS = _BLOCK_THREADS,
    TILE_ITEMS = BLOCK_THREADS,
    N_NODES = _N_NODES,
    N_WARPS = _BLOCK_THREADS / 32,
    DEBUG_VALIDATE = _DEBUG_VALIDATE,
    ITEMS_PER_THREAD = 1
  };
};

template <int _BLOCK_THREADS, int _N_NODES, bool _DEBUG_VALIDATE>
struct ReduceParamsMultiscan {
  enum {
    BLOCK_THREADS = _BLOCK_THREADS,
    ITEMS_PER_THREAD = 1,
    TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
    N_NODES = _N_NODES,
    N_WARPS = _BLOCK_THREADS / 32,
    DEBUG_VALIDATE = _DEBUG_VALIDATE
  };
};

template <typename ParamsT> struct ReduceEnactorMultiscan {
  typedef cub::WarpReduce<gpu_gpair> WarpReduceT;

  struct _TempStorage {
    typename WarpReduceT::TempStorage warp_reduce[ParamsT::N_WARPS];
    gpu_gpair partial_sums[ParamsT::N_NODES][ParamsT::N_WARPS];
  };

  struct TempStorage : cub::Uninitialized<_TempStorage> {};

  struct _Reduction {
    gpu_gpair node_sums[ParamsT::N_NODES];
  };

  struct Reduction : cub::Uninitialized<_Reduction> {};

  // Thread local member variables
  const ItemIter item_iter;
  _TempStorage &temp_storage;
  _Reduction &reduction;
  gpu_gpair gpair;
  NodeIdT node_id;
  NodeIdT node_id_adjusted;
  const int node_begin;

  __device__ __forceinline__
  ReduceEnactorMultiscan(TempStorage &temp_storage,  // NOLINT
                         Reduction &reduction,  // NOLINT
                         const ItemIter item_iter, const int node_begin)
      : temp_storage(temp_storage.Alias()), reduction(reduction.Alias()),
        item_iter(item_iter), node_begin(node_begin) {}

  __device__ __forceinline__ void ResetPartials() {
    if (threadIdx.x < ParamsT::N_WARPS) {
      for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
        temp_storage.partial_sums[NODE][threadIdx.x] = gpu_gpair();
      }
    }
  }

  __device__ __forceinline__ void ResetReduction() {
    if (threadIdx.x < ParamsT::N_NODES) {
      reduction.node_sums[threadIdx.x] = gpu_gpair();
    }
  }

  __device__ __forceinline__ void LoadTile(const bst_uint &offset,
                                           const bst_uint &num_remaining) {
    if (threadIdx.x < num_remaining) {
      bst_uint i = offset + threadIdx.x;
      gpair = thrust::get<0>(item_iter[i]);
      node_id = thrust::get<2>(item_iter[i]);
      node_id_adjusted = node_id - node_begin;
    } else {
      gpair = gpu_gpair();
      node_id = -1;
      node_id_adjusted = -1;
    }
  }

  __device__ __forceinline__ void ProcessTile(const bst_uint &offset,
                                              const bst_uint &num_remaining) {
    LoadTile(offset, num_remaining);

    // Warp synchronous reduction
    for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
      bool active = node_id_adjusted == NODE;

      unsigned int ballot = __ballot(active);

      int warp_id = threadIdx.x / 32;
      int lane_id = threadIdx.x % 32;

      if (ballot == 0) {
        continue;
      } else if (__popc(ballot) == 1) {
        if (active) {
          temp_storage.partial_sums[NODE][warp_id] += gpair;
        }
      } else {
        gpu_gpair sum = WarpReduceT(temp_storage.warp_reduce[warp_id])
                            .Sum(active ? gpair : gpu_gpair());
        if (lane_id == 0) {
          temp_storage.partial_sums[NODE][warp_id] += sum;
        }
      }
    }
  }

  __device__ __forceinline__ void ReducePartials() {
    // Use single warp to reduce partials
    if (threadIdx.x < 32) {
      for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
        gpu_gpair sum =
            WarpReduceT(temp_storage.warp_reduce[0])
                .Sum(threadIdx.x < ParamsT::N_WARPS
                         ? temp_storage.partial_sums[NODE][threadIdx.x]
                         : gpu_gpair());

        if (threadIdx.x == 0) {
          reduction.node_sums[NODE] = sum;
        }
      }
    }
  }

  __device__ __forceinline__ void ProcessRegion(const bst_uint &segment_begin,
                                                const bst_uint &segment_end) {
    // Current position
    bst_uint offset = segment_begin;

    ResetReduction();
    ResetPartials();

    __syncthreads();

    // Process full tiles
    while (offset < segment_end) {
      ProcessTile(offset, segment_end - offset);
      offset += ParamsT::TILE_ITEMS;
    }

    __syncthreads();

    ReducePartials();

    __syncthreads();
  }
};

template <typename ParamsT, typename ReductionT>
struct FindSplitEnactorMultiscan {
  typedef cub::BlockScan<BitFlagSet, ParamsT::BLOCK_THREADS> FlagsBlockScanT;

  typedef cub::WarpReduce<Split> WarpSplitReduceT;

  typedef cub::WarpReduce<float> WarpReduceT;

  typedef cub::WarpScan<gpu_gpair> WarpScanT;

  struct _TempStorage {
    union {
      typename WarpSplitReduceT::TempStorage warp_split_reduce;
      typename FlagsBlockScanT::TempStorage flags_scan;
      typename WarpScanT::TempStorage warp_gpair_scan[ParamsT::N_WARPS];
      typename WarpReduceT::TempStorage warp_reduce[ParamsT::N_WARPS];
    };

    Split warp_best_splits[ParamsT::N_NODES][ParamsT::N_WARPS];
    gpu_gpair partial_sums[ParamsT::N_NODES][ParamsT::N_WARPS];
    gpu_gpair top_level_sum[ParamsT::N_NODES];  // Sum of current partial sums
    gpu_gpair tile_carry[ParamsT::N_NODES];     // Contains top-level sums from
                                                // previous tiles
    Split best_splits[ParamsT::N_NODES];
    // Cache current level nodes into shared memory
    float node_root_gain[ParamsT::N_NODES];
    gpu_gpair node_parent_sum[ParamsT::N_NODES];
  };

  struct TempStorage : cub::Uninitialized<_TempStorage> {};

  // Thread local member variables
  const ItemIter item_iter;
  Split *d_split_candidates_out;
  const Node *d_nodes;
  _TempStorage &temp_storage;
  gpu_gpair gpair;
  float fvalue;
  NodeIdT node_id;
  NodeIdT node_id_adjusted;
  const NodeIdT node_begin;
  const GPUTrainingParam &param;
  const ReductionT &reduction;
  const int level;
  FlagPrefixCallbackOp flag_prefix_op;

  __device__ __forceinline__ FindSplitEnactorMultiscan(
      TempStorage &temp_storage, const ItemIter item_iter,  // NOLINT
      Split *d_split_candidates_out, const Node *d_nodes,
      const NodeIdT node_begin, const GPUTrainingParam &param,
      const ReductionT reduction, const int level)
      : temp_storage(temp_storage.Alias()), item_iter(item_iter),
        d_split_candidates_out(d_split_candidates_out), d_nodes(d_nodes),
        node_begin(node_begin), param(param), reduction(reduction),
        level(level), flag_prefix_op() {}

  __device__ __forceinline__ void UpdateTileCarry() {
    if (threadIdx.x < ParamsT::N_NODES) {
      temp_storage.tile_carry[threadIdx.x] +=
          temp_storage.top_level_sum[threadIdx.x];
    }
  }

  __device__ __forceinline__ void ResetTileCarry() {
    if (threadIdx.x < ParamsT::N_NODES) {
      temp_storage.tile_carry[threadIdx.x] = gpu_gpair();
    }
  }

  __device__ __forceinline__ void ResetPartials() {
    if (threadIdx.x < ParamsT::N_WARPS) {
      for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
        temp_storage.partial_sums[NODE][threadIdx.x] = gpu_gpair();
      }
    }

    if (threadIdx.x < ParamsT::N_NODES) {
      temp_storage.top_level_sum[threadIdx.x] = gpu_gpair();
    }
  }

  __device__ __forceinline__ void ResetSplits() {
    if (threadIdx.x < ParamsT::N_WARPS) {
      for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
        temp_storage.warp_best_splits[NODE][threadIdx.x] = Split();
      }
    }

    if (threadIdx.x < ParamsT::N_NODES) {
      temp_storage.best_splits[threadIdx.x] = Split();
    }
  }

  // Cache d_nodes array for this level into shared memory
  __device__ __forceinline__ void CacheNodes() {
    // Get pointer to nodes on the current level
    const Node *d_nodes_level = d_nodes + node_begin;

    if (threadIdx.x < ParamsT::N_NODES) {
      temp_storage.node_root_gain[threadIdx.x] =
          d_nodes_level[threadIdx.x].root_gain;
      temp_storage.node_parent_sum[threadIdx.x] =
          d_nodes_level[threadIdx.x].sum_gradients;
    }
  }

  __device__ __forceinline__ void LoadTile(bst_uint offset,
                                           bst_uint num_remaining) {
    if (threadIdx.x < num_remaining) {
      bst_uint i = offset + threadIdx.x;
      gpair = thrust::get<0>(item_iter[i]);
      fvalue = thrust::get<1>(item_iter[i]);
      node_id = thrust::get<2>(item_iter[i]);
      node_id_adjusted = node_id - node_begin;
    } else {
      node_id = -1;
      node_id_adjusted = -1;
      fvalue = -FLT_MAX;
      gpair = gpu_gpair();
    }
  }

  // Is this node being processed by the current kernel iteration?
  __device__ __forceinline__ bool NodeActive() {
    return node_id_adjusted < ParamsT::N_NODES && node_id_adjusted >= 0;
  }

  // Is the current fvalue different from the left fvalue
  __device__ __forceinline__ bool
  LeftMostFvalue(const bst_uint &segment_begin, const bst_uint &offset,
                 const bst_uint &num_remaining) {
    int left_index = offset + threadIdx.x - 1;
    float left_fvalue = left_index >= static_cast<int>(segment_begin) &&
                                threadIdx.x < num_remaining
                            ? thrust::get<1>(item_iter[left_index])
                            : -FLT_MAX;

    return left_fvalue != fvalue;
  }

  // Prevent splitting in the middle of same valued instances
  __device__ __forceinline__ bool
  CheckSplitValid(const bst_uint &segment_begin, const bst_uint &offset,
                  const bst_uint &num_remaining) {
    BitFlagSet bit_flag = 0;

    bool valid_split = false;

    if (LeftMostFvalue(segment_begin, offset, num_remaining)) {
      valid_split = true;
      // Use MSB bit to flag if fvalue is leftmost
      set_bit(bit_flag, 63);
    }

    // Flag nodeid
    if (NodeActive()) {
      set_bit(bit_flag, node_id_adjusted);
    }

    FlagsBlockScanT(temp_storage.flags_scan)
        .ExclusiveScan(bit_flag, bit_flag, FlagScanOp(), flag_prefix_op);
    __syncthreads();

    if (!valid_split && NodeActive()) {
      if (!check_bit(bit_flag, node_id_adjusted)) {
        valid_split = true;
      }
    }

    return valid_split;
  }

  // Perform warp reduction to find if this lane contains best loss_chg in warp
  __device__ __forceinline__ bool QueryLaneBestLoss(const float &loss_chg) {
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;

    // Possible source of bugs: not all threads in the warp are active here,
    // and it is not certain the reduce function behaves correctly
    float best = WarpReduceT(temp_storage.warp_reduce[warp_id])
                     .Reduce(loss_chg, cub::Max());

    // It's possible for more than one lane to contain the best value, so make
    // sure only one lane returns true
    unsigned int ballot = __ballot(loss_chg == best);

    if (lane_id == (__ffs(ballot) - 1)) {
      return true;
    } else {
      return false;
    }
  }

  // Which thread in this warp should update the current best split, if any
  // Returns true for one thread or none
  __device__ __forceinline__ bool
  QueryUpdateWarpSplit(const float &loss_chg,
                       volatile const float &warp_best_loss) {
    bool update = false;

    for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
      bool active = node_id_adjusted == NODE;

      unsigned int ballot = __ballot(loss_chg > warp_best_loss && active);

      // No lane has improved loss_chg
      if (__popc(ballot) == 0) {
        continue;
      } else if (__popc(ballot) == 1) {
        // A single lane has improved loss_chg, set true for this lane
        int lane_id = threadIdx.x % 32;

        if (lane_id == __ffs(ballot) - 1) {
          update = true;
        }
      } else {
        // More than one lane has improved loss_chg, perform a reduction.
        if (QueryLaneBestLoss(active ? loss_chg : -FLT_MAX)) {
          update = true;
        }
      }
    }

    return update;
  }

  __device__ void PrintTileScan(int block_id, bool thread_active,
                                float loss_chg, gpu_gpair missing) {
    if (blockIdx.x != block_id) {
      return;
    }

    for (int warp = 0; warp < ParamsT::N_WARPS; warp++) {
      if (threadIdx.x / 32 == warp) {
        for (int lane = 0; lane < 32; lane++) {
          gpu_gpair g = cub::ShuffleIndex(gpair, lane);
          gpu_gpair missing_broadcast = cub::ShuffleIndex(missing, lane);
          float fvalue_broadcast = __shfl(fvalue, lane);
          bool thread_active_broadcast = __shfl(thread_active, lane);
          float loss_chg_broadcast = __shfl(loss_chg, lane);
          NodeIdT node_id_broadcast = cub::ShuffleIndex(node_id, lane);
          if (threadIdx.x == 32 * warp) {
            printf("tid %d, nid %d, fvalue %1.2f, active %c, loss %1.2f, scan ",
                   threadIdx.x + lane, node_id_broadcast, fvalue_broadcast,
                   thread_active_broadcast ? 'y' : 'n',
                   loss_chg_broadcast < 0.0f ? 0 : loss_chg_broadcast);
            g.print();
          }
        }
      }

      __syncthreads();
    }
  }

  __device__ __forceinline__ void
  EvaluateSplits(const bst_uint &segment_begin, const bst_uint &offset,
                 const bst_uint &num_remaining) {
    bool valid_split = CheckSplitValid(segment_begin, offset, num_remaining);

    bool thread_active =
        NodeActive() && valid_split && threadIdx.x < num_remaining;

    const int warp_id = threadIdx.x / 32;

    gpu_gpair parent_sum = thread_active
                               ? temp_storage.node_parent_sum[node_id_adjusted]
                               : gpu_gpair();
    float parent_gain =
        thread_active ? temp_storage.node_root_gain[node_id_adjusted] : 0.0f;
    gpu_gpair missing = thread_active
                            ? parent_sum - reduction.node_sums[node_id_adjusted]
                            : gpu_gpair();

    bool missing_left;

    float loss_chg = thread_active
                         ? loss_chg_missing(gpair, missing, parent_sum,
                                            parent_gain, param, missing_left)
                         : -FLT_MAX;

    // PrintTileScan(64, thread_active, loss_chg, missing);

    float warp_best_loss =
        thread_active
            ? temp_storage.warp_best_splits[node_id_adjusted][warp_id].loss_chg
            : 0.0f;

    if (QueryUpdateWarpSplit(loss_chg, warp_best_loss)) {
      float fvalue_split = fvalue - FVALUE_EPS;

      if (missing_left) {
        gpu_gpair left_sum = missing + gpair;
        gpu_gpair right_sum = parent_sum - left_sum;
        temp_storage.warp_best_splits[node_id_adjusted][warp_id].Update(
            loss_chg, missing_left, fvalue_split, blockIdx.x, left_sum,
            right_sum, param);
      } else {
        gpu_gpair left_sum = gpair;
        gpu_gpair right_sum = parent_sum - left_sum;
        temp_storage.warp_best_splits[node_id_adjusted][warp_id].Update(
            loss_chg, missing_left, fvalue_split, blockIdx.x, left_sum,
            right_sum, param);
      }
    }
  }

  __device__ __forceinline__ void BlockExclusiveScan() {
    ResetPartials();

    __syncthreads();
    int warp_id = threadIdx.x / 32;
    int lane_id = threadIdx.x % 32;

    for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
      bool node_active = node_id_adjusted == NODE;

      unsigned int ballot = __ballot(node_active);

      gpu_gpair warp_sum = gpu_gpair();
      gpu_gpair scan_result = gpu_gpair();

      if (ballot > 0) {
        WarpScanT(temp_storage.warp_gpair_scan[warp_id])
            .InclusiveScan(node_active ? gpair : gpu_gpair(), scan_result,
                           cub::Sum(), warp_sum);
      }

      if (node_active) {
        gpair = scan_result - gpair;
      }

      if (lane_id == 0) {
        temp_storage.partial_sums[NODE][warp_id] = warp_sum;
      }
    }

    __syncthreads();

    if (threadIdx.x < 32) {
      for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
        gpu_gpair top_level_sum;
        bool warp_active = threadIdx.x < ParamsT::N_WARPS;
        gpu_gpair scan_result;
        WarpScanT(temp_storage.warp_gpair_scan[warp_id])
            .InclusiveScan(warp_active
                               ? temp_storage.partial_sums[NODE][threadIdx.x]
                               : gpu_gpair(),
                           scan_result, cub::Sum(), top_level_sum);

        if (warp_active) {
          temp_storage.partial_sums[NODE][threadIdx.x] =
              scan_result - temp_storage.partial_sums[NODE][threadIdx.x];
        }

        if (threadIdx.x == 0) {
          temp_storage.top_level_sum[NODE] = top_level_sum;
        }
      }
    }

    __syncthreads();

    if (NodeActive()) {
      gpair += temp_storage.partial_sums[node_id_adjusted][warp_id] +
               temp_storage.tile_carry[node_id_adjusted];
    }

    __syncthreads();

    UpdateTileCarry();

    __syncthreads();
  }

  __device__ __forceinline__ void ProcessTile(const bst_uint &segment_begin,
                                              const bst_uint &offset,
                                              const bst_uint &num_remaining) {
    LoadTile(offset, num_remaining);
    BlockExclusiveScan();
    EvaluateSplits(segment_begin, offset, num_remaining);
  }

  __device__ __forceinline__ void ReduceSplits() {
    for (int NODE = 0; NODE < ParamsT::N_NODES; NODE++) {
      if (threadIdx.x < 32) {
        Split s = Split();
        if (threadIdx.x < ParamsT::N_WARPS) {
          s = temp_storage.warp_best_splits[NODE][threadIdx.x];
        }
        Split best = WarpSplitReduceT(temp_storage.warp_split_reduce)
                         .Reduce(s, split_reduce_op());
        if (threadIdx.x == 0) {
          temp_storage.best_splits[NODE] = best;
        }
      }
    }
  }

  __device__ __forceinline__ void WriteBestSplits() {
    const int nodes_level = 1 << level;

    if (threadIdx.x < ParamsT::N_NODES) {
      d_split_candidates_out[blockIdx.x * nodes_level + threadIdx.x] =
          temp_storage.best_splits[threadIdx.x];
    }
  }

  __device__ __forceinline__ void ProcessRegion(const bst_uint &segment_begin,
                                                const bst_uint &segment_end) {
    // Current position
    bst_uint offset = segment_begin;

    ResetTileCarry();
    ResetSplits();
    CacheNodes();
    __syncthreads();

    // Process full tiles
    while (offset < segment_end) {
      ProcessTile(segment_begin, offset, segment_end - offset);
      __syncthreads();
      offset += ParamsT::TILE_ITEMS;
    }

    __syncthreads();
    ReduceSplits();

    __syncthreads();
    WriteBestSplits();
  }
};

template <typename FindSplitParamsT, typename ReduceParamsT>
__global__ void
#if __CUDA_ARCH__ <= 530
__launch_bounds__(1024, 2)
#endif
find_split_candidates_multiscan_kernel(
    const ItemIter items_iter, Split *d_split_candidates_out,
    const Node *d_nodes, const int node_begin, bst_uint num_items,
    int num_features, const int *d_feature_offsets,
    const GPUTrainingParam param, const int *d_feature_flags,
    const int level) {
  if (num_items <= 0 || d_feature_flags[blockIdx.x] != 1) {
    return;
  }

  int segment_begin = d_feature_offsets[blockIdx.x];
  int segment_end = d_feature_offsets[blockIdx.x + 1];

  typedef ReduceEnactorMultiscan<ReduceParamsT> ReduceT;
  typedef FindSplitEnactorMultiscan<FindSplitParamsT,
                                    typename ReduceT::_Reduction>
      FindSplitT;

  __shared__ union {
    typename ReduceT::TempStorage reduce;
    typename FindSplitT::TempStorage find_split;
  } temp_storage;

  __shared__ typename ReduceT::Reduction reduction;

  ReduceT(temp_storage.reduce, reduction, items_iter, node_begin)
      .ProcessRegion(segment_begin, segment_end);
  __syncthreads();

  FindSplitT find_split(temp_storage.find_split, items_iter,
                        d_split_candidates_out, d_nodes, node_begin, param,
                        reduction.Alias(), level);
  find_split.ProcessRegion(segment_begin, segment_end);
}

template <int N_NODES>
void find_split_candidates_multiscan_variation(GPUData *data, const int level) {
  const int node_begin = (1 << level) - 1;
  const int BLOCK_THREADS = 512;

  CHECK(BLOCK_THREADS / 32 < 32)
      << "Too many active warps. See FindSplitEnactor - ReduceSplits.";

  typedef FindSplitParamsMultiscan<BLOCK_THREADS, N_NODES, false>
      find_split_params;
  typedef ReduceParamsMultiscan<BLOCK_THREADS, N_NODES, false> reduce_params;
  int grid_size = data->n_features;

  find_split_candidates_multiscan_kernel<
      find_split_params,
      reduce_params><<<grid_size, find_split_params::BLOCK_THREADS>>>(
      data->items_iter, data->split_candidates.data(), data->nodes.data(),
      node_begin, data->fvalues.size(), data->n_features, data->foffsets.data(),
      data->param, data->feature_flags.data(), level);

  dh::safe_cuda(cudaDeviceSynchronize());
}

void find_split_candidates_multiscan(GPUData *data, const int level) {
  // Select templated variation of split finding algorithm
  switch (level) {
  case 0:
    find_split_candidates_multiscan_variation<1>(data, level);
    break;
  case 1:
    find_split_candidates_multiscan_variation<2>(data, level);
    break;
  case 2:
    find_split_candidates_multiscan_variation<4>(data, level);
    break;
  case 3:
    find_split_candidates_multiscan_variation<8>(data, level);
    break;
  case 4:
    find_split_candidates_multiscan_variation<16>(data, level);
    break;
  }
}
}  // namespace tree
}  // namespace xgboost
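The switch above works because level l of the dense BFS tree holds 1 << l nodes, so the template variations 1, 2, 4, 8, 16 cover exactly levels 0 through 4, the levels for which GPUBuilder selects the multiscan algorithm (multiscan_levels = 5). A trivial host check of that correspondence:

#include <cassert>

int main() {
  const int multiscan_levels = 5;  // mirrors GPUBuilder::multiscan_levels
  int variations[] = {1, 2, 4, 8, 16};
  for (int level = 0; level < multiscan_levels; ++level) {
    assert((1 << level) == variations[level]);  // nodes on this level
  }
  return 0;
}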
@@ -1,414 +0,0 @@
/*!
 * Copyright 2016 Rory Mitchell
 */
#pragma once
#include <xgboost/base.h>
#include <cub/cub.cuh>
#include "common.cuh"
#include "device_helpers.cuh"
#include "types.cuh"

namespace xgboost {
namespace tree {

struct ScanTuple {
  gpu_gpair gpair;
  NodeIdT node_id;

  __device__ ScanTuple() {}

  __device__ ScanTuple(gpu_gpair gpair, NodeIdT node_id)
      : gpair(gpair), node_id(node_id) {}

  __device__ ScanTuple operator+=(const ScanTuple &rhs) {
    if (node_id != rhs.node_id) {
      *this = rhs;
      return *this;
    } else {
      gpair += rhs.gpair;
      return *this;
    }
  }
  __device__ ScanTuple operator+(const ScanTuple &rhs) const {
    ScanTuple t = *this;
    return t += rhs;
  }
};
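// Illustration (not part of the original file): ScanTuple's operator+ turns
// the block scan into a segmented prefix sum keyed by node_id. When the
// right-hand operand belongs to a different node, the running sum is
// discarded and the scan restarts:
//   (g0, node 4) + (g1, node 4) -> (g0 + g1, node 4)   same segment
//   (g0, node 4) + (g2, node 5) -> (g2,      node 5)   new segment
// A host analogue with int gradients:
struct ScanTupleDemo { int g; NodeIdT node; };
inline ScanTupleDemo scan_tuple_demo_add(ScanTupleDemo a, ScanTupleDemo b) {
  if (a.node != b.node) return b;
  return ScanTupleDemo{a.g + b.g, b.node};
}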
struct GpairTupleCallbackOp {
  // Running prefix
  ScanTuple running_total;
  // Constructor
  __device__ GpairTupleCallbackOp()
      : running_total(ScanTuple(gpu_gpair(), -1)) {}
  __device__ ScanTuple operator()(ScanTuple block_aggregate) {
    ScanTuple old_prefix = running_total;
    running_total += block_aggregate;
    return old_prefix;
  }
};

template <int BLOCK_THREADS>
struct ReduceEnactorSorting {
  typedef cub::BlockScan<ScanTuple, BLOCK_THREADS> GpairScanT;
  struct _TempStorage {
    typename GpairScanT::TempStorage gpair_scan;
  };

  struct TempStorage : cub::Uninitialized<_TempStorage> {};

  // Thread local member variables
  gpu_gpair *d_block_node_sums;
  int *d_block_node_offsets;
  const ItemIter item_iter;
  _TempStorage &temp_storage;
  gpu_gpair gpair;
  NodeIdT node_id;
  NodeIdT right_node_id;
  // Contains node_id relative to the current level only
  NodeIdT node_id_adjusted;
  GpairTupleCallbackOp callback_op;
  const int level;

  __device__ __forceinline__
  ReduceEnactorSorting(TempStorage &temp_storage,  // NOLINT
                       gpu_gpair *d_block_node_sums, int *d_block_node_offsets,
                       ItemIter item_iter, const int level)
      : temp_storage(temp_storage.Alias()),
        d_block_node_sums(d_block_node_sums),
        d_block_node_offsets(d_block_node_offsets),
        item_iter(item_iter),
        callback_op(),
        level(level) {}

  __device__ __forceinline__ void LoadTile(const bst_uint &offset,
                                           const bst_uint &num_remaining) {
    if (threadIdx.x < num_remaining) {
      bst_uint i = offset + threadIdx.x;
      gpair = thrust::get<0>(item_iter[i]);
      node_id = thrust::get<2>(item_iter[i]);
      right_node_id = threadIdx.x == num_remaining - 1
                          ? -1
                          : thrust::get<2>(item_iter[i + 1]);
      // Prevent overflow
      const int level_begin = (1 << level) - 1;
      node_id_adjusted =
          max(static_cast<int>(node_id) - level_begin, -1);  // NOLINT
    }
  }

  __device__ __forceinline__ void ProcessTile(const bst_uint &offset,
                                              const bst_uint &num_remaining) {
    LoadTile(offset, num_remaining);

    ScanTuple t(gpair, node_id);
    GpairScanT(temp_storage.gpair_scan).InclusiveSum(t, t, callback_op);
    __syncthreads();

    // If tail of segment
    if (node_id != right_node_id && node_id_adjusted >= 0 &&
        threadIdx.x < num_remaining) {
      // Write sum
      d_block_node_sums[node_id_adjusted] = t.gpair;
      // Write offset
      d_block_node_offsets[node_id_adjusted] = offset + threadIdx.x + 1;
    }
  }

  __device__ __forceinline__ void ProcessRegion(const bst_uint &segment_begin,
                                                const bst_uint &segment_end) {
    const int max_nodes = 1 << level;
    dh::block_fill(d_block_node_offsets, max_nodes, -1);
    dh::block_fill(d_block_node_sums, max_nodes, gpu_gpair());

    // Current position
    bst_uint offset = segment_begin;

    __syncthreads();

    // Process full tiles
    while (offset < segment_end) {
      ProcessTile(offset, segment_end - offset);
      offset += BLOCK_THREADS;
    }
  }
};

template <int BLOCK_THREADS, int N_WARPS = BLOCK_THREADS / 32>
struct FindSplitEnactorSorting {
  typedef cub::BlockScan<gpu_gpair, BLOCK_THREADS> GpairScanT;
  typedef cub::BlockReduce<Split, BLOCK_THREADS> SplitReduceT;
  typedef cub::WarpReduce<float> WarpLossReduceT;

  struct _TempStorage {
    union {
      typename GpairScanT::TempStorage gpair_scan;
      typename SplitReduceT::TempStorage split_reduce;
      typename WarpLossReduceT::TempStorage loss_reduce[N_WARPS];
    };
    Split warp_best_splits[N_WARPS];
  };

  struct TempStorage : cub::Uninitialized<_TempStorage> {};

  // Thread local member variables
  _TempStorage &temp_storage;
  gpu_gpair *d_block_node_sums;
  int *d_block_node_offsets;
  const ItemIter item_iter;
  const Node *d_nodes;
  gpu_gpair gpair;
  float fvalue;
  NodeIdT node_id;
  float left_fvalue;
  const GPUTrainingParam &param;
  Split *d_split_candidates_out;
  const int level;

  __device__ __forceinline__ FindSplitEnactorSorting(
      TempStorage &temp_storage, gpu_gpair *d_block_node_sums,  // NOLINT
      int *d_block_node_offsets, const ItemIter item_iter, const Node *d_nodes,
      const GPUTrainingParam &param, Split *d_split_candidates_out,
      const int level)
      : temp_storage(temp_storage.Alias()),
        d_block_node_sums(d_block_node_sums),
        d_block_node_offsets(d_block_node_offsets),
        item_iter(item_iter),
        d_nodes(d_nodes),
        d_split_candidates_out(d_split_candidates_out),
        level(level),
        param(param) {}

  __device__ __forceinline__ void LoadTile(NodeIdT node_id_adjusted,
                                           const bst_uint &node_begin,
                                           const bst_uint &offset,
                                           const bst_uint &num_remaining) {
    if (threadIdx.x < num_remaining) {
      bst_uint i = offset + threadIdx.x;
      gpair = thrust::get<0>(item_iter[i]);
      fvalue = thrust::get<1>(item_iter[i]);
      node_id = thrust::get<2>(item_iter[i]);
      bool first_item = offset + threadIdx.x == node_begin;
      left_fvalue =
          first_item ? fvalue - FVALUE_EPS : thrust::get<1>(item_iter[i - 1]);
    }
  }

  __device__ void PrintTileScan(int block_id, bool thread_active,
                                float loss_chg, gpu_gpair missing) {
    if (blockIdx.x != block_id) {
      return;
    }

    for (int warp = 0; warp < N_WARPS; warp++) {
      if (threadIdx.x / 32 == warp) {
        for (int lane = 0; lane < 32; lane++) {
          gpu_gpair g = cub::ShuffleIndex(gpair, lane);
          gpu_gpair missing_broadcast = cub::ShuffleIndex(missing, lane);
          float fvalue_broadcast = __shfl(fvalue, lane);
          bool thread_active_broadcast = __shfl(thread_active, lane);
          float loss_chg_broadcast = __shfl(loss_chg, lane);
          if (threadIdx.x == 32 * warp) {
            printf("tid %d, fvalue %1.2f, active %c, loss %1.2f, scan ",
                   threadIdx.x + lane, fvalue_broadcast,
                   thread_active_broadcast ? 'y' : 'n',
                   loss_chg_broadcast < 0.0f ? 0 : loss_chg_broadcast);
            g.print();
          }
        }
      }

      __syncthreads();
    }
  }

  __device__ __forceinline__ bool QueryUpdateWarpSplit(float loss_chg,
                                                       float warp_best_loss,
                                                       bool thread_active) {
    int warp_id = threadIdx.x / 32;
    int ballot = __ballot(loss_chg > warp_best_loss && thread_active);
    if (ballot == 0) {
      return false;
    } else {
      // Warp reduce best loss
      float best = WarpLossReduceT(temp_storage.loss_reduce[warp_id])
                       .Reduce(loss_chg, cub::Max());
      // Broadcast
      best = cub::ShuffleIndex(best, 0);

      if (loss_chg == best) {
        return true;
      }
    }

    return false;
  }

  __device__ __forceinline__ bool LeftmostFvalue() {
    return fvalue != left_fvalue;
  }

  __device__ __forceinline__ void EvaluateSplits(
      const NodeIdT &node_id_adjusted, const bst_uint &node_begin,
      const bst_uint &offset, const bst_uint &num_remaining) {
    bool thread_active = LeftmostFvalue() && threadIdx.x < num_remaining &&
                         node_id_adjusted >= 0 && node_id >= 0;

    Node n = thread_active ? d_nodes[node_id] : Node();
    gpu_gpair missing =
        thread_active ? n.sum_gradients - d_block_node_sums[node_id_adjusted]
                      : gpu_gpair();

    bool missing_left;
    float loss_chg = thread_active
                         ? loss_chg_missing(gpair, missing, n.sum_gradients,
                                            n.root_gain, param, missing_left)
                         : -FLT_MAX;

    int warp_id = threadIdx.x / 32;
    volatile float warp_best_loss =
        temp_storage.warp_best_splits[warp_id].loss_chg;

    if (QueryUpdateWarpSplit(loss_chg, warp_best_loss, thread_active)) {
      float fvalue_split = (fvalue + left_fvalue) / 2.0f;

      gpu_gpair left_sum = gpair;
      if (missing_left) {
        left_sum += missing;
      }
      gpu_gpair right_sum = n.sum_gradients - left_sum;
      temp_storage.warp_best_splits[warp_id].Update(loss_chg, missing_left,
                                                    fvalue_split, blockIdx.x,
                                                    left_sum, right_sum, param);
    }
  }

  __device__ __forceinline__ void ProcessTile(
      const NodeIdT &node_id_adjusted, const bst_uint &node_begin,
      const bst_uint &offset, const bst_uint &num_remaining,
      GpairCallbackOp &callback_op) {  // NOLINT
    LoadTile(node_id_adjusted, node_begin, offset, num_remaining);

    // Scan gpair
    const bool thread_active = threadIdx.x < num_remaining && node_id >= 0;
    GpairScanT(temp_storage.gpair_scan)
        .ExclusiveSum(thread_active ? gpair : gpu_gpair(), gpair, callback_op);
    __syncthreads();
    // Evaluate split
    EvaluateSplits(node_id_adjusted, node_begin, offset, num_remaining);
  }

  __device__ __forceinline__ void WriteBestSplit(
      const NodeIdT &node_id_adjusted) {
    if (threadIdx.x < 32) {
      bool active = threadIdx.x < N_WARPS;
      float warp_loss =
          active ? temp_storage.warp_best_splits[threadIdx.x].loss_chg
                 : -FLT_MAX;
      if (QueryUpdateWarpSplit(warp_loss, 0, active)) {
        const int max_nodes = 1 << level;
        d_split_candidates_out[blockIdx.x * max_nodes + node_id_adjusted] =
            temp_storage.warp_best_splits[threadIdx.x];
      }
    }
  }

  __device__ __forceinline__ void ProcessNode(const NodeIdT &node_id_adjusted,
                                              const bst_uint &node_begin,
                                              const bst_uint &node_end) {
    dh::block_fill(temp_storage.warp_best_splits, N_WARPS, Split());

    GpairCallbackOp callback_op = GpairCallbackOp();

    bst_uint offset = node_begin;

    while (offset < node_end) {
      ProcessTile(node_id_adjusted, node_begin, offset, node_end - offset,
                  callback_op);
      offset += BLOCK_THREADS;
      __syncthreads();
    }

    WriteBestSplit(node_id_adjusted);
  }

  __device__ __forceinline__ void ProcessFeature(const bst_uint &segment_begin,
                                                 const bst_uint &segment_end) {
    int node_begin = segment_begin;

    const int max_nodes = 1 << level;

    // Iterate through nodes
    int active_nodes = 0;
    for (int i = 0; i < max_nodes; i++) {
      int node_end = d_block_node_offsets[i];

      if (node_end == -1) {
        continue;
      }

      active_nodes++;

      ProcessNode(i, node_begin, node_end);

      __syncthreads();

      node_begin = node_end;
    }
  }
};

template <int BLOCK_THREADS>
__global__ __launch_bounds__(1024, 1) void find_split_candidates_sorted_kernel(
    const ItemIter items_iter, Split *d_split_candidates_out,
    const Node *d_nodes, bst_uint num_items, const int num_features,
    const int *d_feature_offsets, gpu_gpair *d_node_sums, int *d_node_offsets,
    const GPUTrainingParam param, const int *d_feature_flags, const int level) {
  if (num_items <= 0 || d_feature_flags[blockIdx.x] != 1) {
    return;
  }

  bst_uint segment_begin = d_feature_offsets[blockIdx.x];
  bst_uint segment_end = d_feature_offsets[blockIdx.x + 1];

  typedef ReduceEnactorSorting<BLOCK_THREADS> ReduceT;
  typedef FindSplitEnactorSorting<BLOCK_THREADS> FindSplitT;

  __shared__ union {
    typename ReduceT::TempStorage reduce;
    typename FindSplitT::TempStorage find_split;
  } temp_storage;

  const int max_nodes_level = 1 << level;
  gpu_gpair *d_block_node_sums = d_node_sums + blockIdx.x * max_nodes_level;
  int *d_block_node_offsets = d_node_offsets + blockIdx.x * max_nodes_level;

  ReduceT(temp_storage.reduce, d_block_node_sums, d_block_node_offsets,
          items_iter, level)
      .ProcessRegion(segment_begin, segment_end);
  __syncthreads();

  FindSplitT(temp_storage.find_split, d_block_node_sums, d_block_node_offsets,
             items_iter, d_nodes, param, d_split_candidates_out, level)
      .ProcessFeature(segment_begin, segment_end);
}

void find_split_candidates_sorted(GPUData *data, const int level) {
  const int BLOCK_THREADS = 512;

  CHECK(BLOCK_THREADS / 32 < 32) << "Too many active warps.";

  int grid_size = data->n_features;

  find_split_candidates_sorted_kernel<
      BLOCK_THREADS><<<grid_size, BLOCK_THREADS>>>(
      data->items_iter, data->split_candidates.data(), data->nodes.data(),
      data->fvalues.size(), data->n_features, data->foffsets.data(),
      data->node_sums.data(), data->node_offsets.data(), data->param,
      data->feature_flags.data(), level);

  dh::safe_cuda(cudaGetLastError());
  dh::safe_cuda(cudaDeviceSynchronize());
}
}  // namespace tree
}  // namespace xgboost
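One notable difference from the multiscan enactor is the split threshold: the sorting kernel above places it at the midpoint between adjacent distinct feature values, whereas the multiscan kernel nudges the current value down by FVALUE_EPS. A toy comparison (the values are made up; FVALUE_EPS_DEMO stands in for the plugin's FVALUE_EPS constant):

#include <cstdio>

int main() {
  const float FVALUE_EPS_DEMO = 1e-5f;
  float left_fvalue = 2.0f, fvalue = 3.0f;
  // sorting kernel: midpoint between adjacent distinct feature values
  printf("sorted split at %f\n", (fvalue + left_fvalue) / 2.0f);  // 2.5
  // multiscan kernel: just below the current feature value
  printf("multiscan split at %f\n", fvalue - FVALUE_EPS_DEMO);
  return 0;
}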
@@ -1,230 +0,0 @@
/*!
 * Copyright 2016 Rory Mitchell
 */
#include "gpu_builder.cuh"
#include <cub/cub.cuh>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>
#include "../../../src/common/random.h"
#include "common.cuh"
#include "device_helpers.cuh"
#include "find_split.cuh"
#include "gpu_data.cuh"
#include "types.cuh"

namespace xgboost {
namespace tree {
GPUBuilder::GPUBuilder() { gpu_data = new GPUData(); }

void GPUBuilder::Init(const TrainParam& param_in) {
  param = param_in;
  CHECK(param.max_depth < 16) << "Tree depth too large.";

  dh::safe_cuda(cudaSetDevice(param.gpu_id));
  if (!param.silent) {
    LOG(CONSOLE) << "Device: [" << param.gpu_id << "] " << dh::device_name();
  }
}

GPUBuilder::~GPUBuilder() { delete gpu_data; }

void GPUBuilder::UpdateNodeId(int level) {
  auto* d_node_id_instance = gpu_data->node_id_instance.data();
  Node* d_nodes = gpu_data->nodes.data();

  dh::launch_n(gpu_data->node_id_instance.size(), [=] __device__(int i) {
    NodeIdT item_node_id = d_node_id_instance[i];

    if (item_node_id < 0) {
      return;
    }

    Node node = d_nodes[item_node_id];

    if (node.IsLeaf()) {
      d_node_id_instance[i] = -1;
    } else if (node.split.missing_left) {
      d_node_id_instance[i] = item_node_id * 2 + 1;
    } else {
      d_node_id_instance[i] = item_node_id * 2 + 2;
    }
  });

  dh::safe_cuda(cudaDeviceSynchronize());

  auto* d_fvalues = gpu_data->fvalues.data();
  auto* d_instance_id = gpu_data->instance_id.data();
  auto* d_node_id = gpu_data->node_id.data();
  auto* d_feature_id = gpu_data->feature_id.data();

  // Update node based on fvalue where it exists
  dh::launch_n(gpu_data->fvalues.size(), [=] __device__(int i) {
    NodeIdT item_node_id = d_node_id[i];

    if (item_node_id < 0) {
      return;
    }

    Node node = d_nodes[item_node_id];

    if (node.IsLeaf()) {
      return;
    }

    int feature_id = d_feature_id[i];

    if (feature_id == node.split.findex) {
      float fvalue = d_fvalues[i];
      bst_uint instance_id = d_instance_id[i];

      if (fvalue < node.split.fvalue) {
        d_node_id_instance[instance_id] = item_node_id * 2 + 1;
      } else {
        d_node_id_instance[instance_id] = item_node_id * 2 + 2;
      }
    }
  });

  dh::safe_cuda(cudaDeviceSynchronize());

  gpu_data->GatherNodeId();
}

void GPUBuilder::Sort(int level) {
  thrust::sequence(gpu_data->sort_index_in.tbegin(),
                   gpu_data->sort_index_in.tend());

  cub::DoubleBuffer<NodeIdT> d_keys(gpu_data->node_id.data(),
                                    gpu_data->node_id_temp.data());
  cub::DoubleBuffer<int> d_values(gpu_data->sort_index_in.data(),
                                  gpu_data->sort_index_out.data());

  size_t temp_size = gpu_data->cub_mem.size();

  cub::DeviceSegmentedRadixSort::SortPairs(
      gpu_data->cub_mem.data(), temp_size, d_keys, d_values,
      gpu_data->fvalues.size(), gpu_data->n_features, gpu_data->foffsets.data(),
      gpu_data->foffsets.data() + 1);

  auto zip = thrust::make_zip_iterator(thrust::make_tuple(
      gpu_data->fvalues.tbegin(), gpu_data->instance_id.tbegin()));
  auto zip_temp = thrust::make_zip_iterator(thrust::make_tuple(
      gpu_data->fvalues_temp.tbegin(), gpu_data->instance_id_temp.tbegin()));
  thrust::gather(thrust::device_pointer_cast(d_values.Current()),
                 thrust::device_pointer_cast(d_values.Current()) +
                     gpu_data->sort_index_out.size(),
                 zip, zip_temp);
  thrust::copy(zip_temp, zip_temp + gpu_data->fvalues.size(), zip);

  if (d_keys.Current() == gpu_data->node_id_temp.data()) {
    thrust::copy(gpu_data->node_id_temp.tbegin(), gpu_data->node_id_temp.tend(),
                 gpu_data->node_id.tbegin());
  }
}

void GPUBuilder::ColsampleTree() {
  unsigned n =
      static_cast<unsigned>(param.colsample_bytree * gpu_data->n_features);
  CHECK_GT(n, 0);

  feature_set_tree.resize(gpu_data->n_features);
  std::iota(feature_set_tree.begin(), feature_set_tree.end(), 0);
  std::shuffle(feature_set_tree.begin(), feature_set_tree.end(),
               common::GlobalRandom());
}

void GPUBuilder::Update(const std::vector<bst_gpair>& gpair, DMatrix* p_fmat,
                        RegTree* p_tree) {
  this->InitData(gpair, *p_fmat, *p_tree);
  this->InitFirstNode();
  this->ColsampleTree();

  for (int level = 0; level < param.max_depth; level++) {
    bool use_multiscan_algorithm = level < multiscan_levels;

    if (level > 0) {
      this->UpdateNodeId(level);
    }

    if (level > 0 && !use_multiscan_algorithm) {
      this->Sort(level);
    }

    find_split(gpu_data, param, level, use_multiscan_algorithm,
               feature_set_tree, &feature_set_level);
  }

  dense2sparse_tree(p_tree, gpu_data->nodes.tbegin(), gpu_data->nodes.tend(),
                    param);
}

void GPUBuilder::InitData(const std::vector<bst_gpair>& gpair, DMatrix& fmat,
                          const RegTree& tree) {
  CHECK(fmat.SingleColBlock()) << "grow_gpu: must have single column block. "
                                  "Try setting 'tree_method' parameter to "
                                  "'exact'";

  if (gpu_data->IsAllocated()) {
    gpu_data->Reset(gpair, param.subsample);
    return;
  }

  MetaInfo info = fmat.info();

  std::vector<int> foffsets;
  foffsets.push_back(0);
  std::vector<int> feature_id;
  std::vector<float> fvalues;
  std::vector<bst_uint> instance_id;
  fvalues.reserve(info.num_col * info.num_row);
  instance_id.reserve(info.num_col * info.num_row);
  feature_id.reserve(info.num_col * info.num_row);

  dmlc::DataIter<ColBatch>* iter = fmat.ColIterator();

  while (iter->Next()) {
    const ColBatch& batch = iter->Value();

    for (int i = 0; i < batch.size; i++) {
      const ColBatch::Inst& col = batch[i];

      for (const ColBatch::Entry* it = col.data; it != col.data + col.length;
           it++) {
        bst_uint inst_id = it->index;
        fvalues.push_back(it->fvalue);
        instance_id.push_back(inst_id);
        feature_id.push_back(i);
      }
      foffsets.push_back(fvalues.size());
    }
  }

  gpu_data->Init(fvalues, foffsets, instance_id, feature_id, gpair,
                 info.num_row, info.num_col, param.max_depth, param);
}

void GPUBuilder::InitFirstNode() {
  // Build the root node on the CPU and copy to device
  gpu_gpair sum_gradients =
      thrust::reduce(gpu_data->gpair.tbegin(), gpu_data->gpair.tend(),
                     gpu_gpair(0, 0), cub::Sum());

  Node tmp = Node(
      sum_gradients,
      CalcGain(gpu_data->param, sum_gradients.grad(), sum_gradients.hess()),
      CalcWeight(gpu_data->param, sum_gradients.grad(), sum_gradients.hess()));

  thrust::copy_n(&tmp, 1, gpu_data->nodes.tbegin());
}
}  // namespace tree
}  // namespace xgboost
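The two dh::launch_n passes in UpdateNodeId implement a simple routing rule: an instance first inherits its node's default (missing-value) direction, and the second pass overrides that choice for instances that actually have a value for the split feature. A condensed host-side sketch of the resulting rule (DemoSplit is a simplified stand-in for Node::split, not the plugin's type):

struct DemoSplit { float fvalue; bool missing_left; };

int route(int node_id, bool has_value, float fvalue, DemoSplit s) {
  if (!has_value)  // missing value: follow the node's default direction
    return s.missing_left ? node_id * 2 + 1 : node_id * 2 + 2;
  // present value: compare against the split threshold
  return fvalue < s.fvalue ? node_id * 2 + 1 : node_id * 2 + 2;
}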
@@ -1,46 +0,0 @@
/*!
 * Copyright 2016 Rory Mitchell
 */
#pragma once
#include <xgboost/tree_updater.h>
#include <vector>
#include "../../src/tree/param.h"

namespace xgboost {

namespace tree {

struct gpu_gpair;
struct GPUData;

class GPUBuilder {
 public:
  GPUBuilder();
  void Init(const TrainParam &param);
  ~GPUBuilder();

  void UpdateParam(const TrainParam &param) { this->param = param; }

  void Update(const std::vector<bst_gpair> &gpair, DMatrix *p_fmat,
              RegTree *p_tree);

  void UpdateNodeId(int level);

 private:
  void InitData(const std::vector<bst_gpair> &gpair, DMatrix &fmat,  // NOLINT
                const RegTree &tree);

  void Sort(int level);
  void InitFirstNode();
  void ColsampleTree();

  TrainParam param;
  GPUData *gpu_data;
  std::vector<int> feature_set_tree;
  std::vector<int> feature_set_level;

  // Number of levels before switching to the sorting algorithm
  int multiscan_levels = 5;
};
}  // namespace tree
}  // namespace xgboost
@@ -3,10 +3,10 @@
  */
 #include <xgboost/tree_updater.h>
 #include <vector>
-#include "../../src/common/random.h"
-#include "../../src/common/sync.h"
-#include "../../src/tree/param.h"
-#include "gpu_builder.cuh"
+#include "../../../src/common/random.h"
+#include "../../../src/common/sync.h"
+#include "../../../src/tree/param.h"
+#include "exact/gpu_builder.cuh"
 #include "gpu_hist_builder.cuh"
 
 namespace xgboost {
@@ -45,7 +45,7 @@ class GPUMaker : public TreeUpdater {
  protected:
   // training parameter
   TrainParam param;
-  GPUBuilder builder;
+  exact::GPUBuilder<int16_t> builder;
 };
 
 template <typename TStats>