[GPU-Plugin] Multi-GPU gpu_id bug fixes for grow_gpu_hist and grow_gpu methods, and additional documentation for the gpu plugin. (#2463)
This commit is contained in:
committed by
Rory Mitchell
parent
91dae84a00
commit
6b287177c8
@@ -2,16 +2,16 @@
|
||||
* Copyright 2017 XGBoost contributors
|
||||
*/
|
||||
#pragma once
|
||||
#include <cstdio>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "../../../src/common/random.h"
|
||||
#include "../../../src/tree/param.h"
|
||||
#include "device_helpers.cuh"
|
||||
#include "types.cuh"
|
||||
#include <string>
|
||||
#include <stdexcept>
|
||||
#include <cstdio>
|
||||
#include "cub/cub.cuh"
|
||||
#include "device_helpers.cuh"
|
||||
#include "device_helpers.cuh"
|
||||
#include "types.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@@ -172,8 +172,8 @@ inline void subsample_gpair(dh::dvec<gpu_gpair>* p_gpair, float subsample) {
|
||||
}
|
||||
|
||||
inline std::vector<int> col_sample(std::vector<int> features, float colsample) {
|
||||
int n = colsample * features.size();
|
||||
CHECK_GT(n, 0);
|
||||
CHECK_GT(features.size(), 0);
|
||||
int n = std::max(1,static_cast<int>(colsample * features.size()));
|
||||
|
||||
std::shuffle(features.begin(), features.end(), common::GlobalRandom());
|
||||
features.resize(n);
|
||||
@@ -202,17 +202,18 @@ struct GpairCallbackOp {
|
||||
* @param offsets the segments
|
||||
*/
|
||||
template <typename T1, typename T2>
|
||||
void segmentedSort(dh::CubMemory &tmp_mem, dh::dvec2<T1> &keys, dh::dvec2<T2> &vals,
|
||||
int nVals, int nSegs, dh::dvec<int> &offsets, int start=0,
|
||||
int end=sizeof(T1)*8) {
|
||||
void segmentedSort(dh::CubMemory& tmp_mem, dh::dvec2<T1>& keys,
|
||||
dh::dvec2<T2>& vals, int nVals, int nSegs,
|
||||
dh::dvec<int>& offsets, int start = 0,
|
||||
int end = sizeof(T1) * 8) {
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
|
||||
NULL, tmpSize, keys.buff(), vals.buff(), nVals, nSegs,
|
||||
offsets.data(), offsets.data()+1, start, end));
|
||||
NULL, tmpSize, keys.buff(), vals.buff(), nVals, nSegs, offsets.data(),
|
||||
offsets.data() + 1, start, end));
|
||||
tmp_mem.LazyAllocate(tmpSize);
|
||||
dh::safe_cuda(cub::DeviceSegmentedRadixSort::SortPairs(
|
||||
tmp_mem.d_temp_storage, tmpSize, keys.buff(), vals.buff(),
|
||||
nVals, nSegs, offsets.data(), offsets.data()+1, start, end));
|
||||
tmp_mem.d_temp_storage, tmpSize, keys.buff(), vals.buff(), nVals, nSegs,
|
||||
offsets.data(), offsets.data() + 1, start, end));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -223,11 +224,11 @@ void segmentedSort(dh::CubMemory &tmp_mem, dh::dvec2<T1> &keys, dh::dvec2<T2> &v
|
||||
* @param nVals number of elements in the input array
|
||||
*/
|
||||
template <typename T>
|
||||
void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
|
||||
void sumReduction(dh::CubMemory& tmp_mem, dh::dvec<T>& in, dh::dvec<T>& out,
|
||||
int nVals) {
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(),
|
||||
nVals));
|
||||
dh::safe_cuda(
|
||||
cub::DeviceReduce::Sum(NULL, tmpSize, in.data(), out.data(), nVals));
|
||||
tmp_mem.LazyAllocate(tmpSize);
|
||||
dh::safe_cuda(cub::DeviceReduce::Sum(tmp_mem.d_temp_storage, tmpSize,
|
||||
in.data(), out.data(), nVals));
|
||||
@@ -239,9 +240,10 @@ void sumReduction(dh::CubMemory &tmp_mem, dh::dvec<T> &in, dh::dvec<T> &out,
|
||||
* @param len number of elements i the buffer
|
||||
* @param def default value to be filled
|
||||
*/
|
||||
template <typename T, int BlkDim=256, int ItemsPerThread=4>
|
||||
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
|
||||
void fillConst(int device_idx, T* out, int len, T def) {
|
||||
dh::launch_n<ItemsPerThread,BlkDim>(device_idx, len, [=] __device__(int i) { out[i] = def; });
|
||||
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, len,
|
||||
[=] __device__(int i) { out[i] = def; });
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -253,17 +255,17 @@ void fillConst(int device_idx, T* out, int len, T def) {
|
||||
* @param instId gather indices
|
||||
* @param nVals length of the buffers
|
||||
*/
|
||||
template <typename T1, typename T2, int BlkDim=256, int ItemsPerThread=4>
|
||||
void gather(int device_idx, T1* out1, const T1* in1, T2* out2, const T2* in2, const int* instId,
|
||||
int nVals) {
|
||||
dh::launch_n<ItemsPerThread,BlkDim>
|
||||
(device_idx, nVals, [=] __device__(int i) {
|
||||
int iid = instId[i];
|
||||
T1 v1 = in1[iid];
|
||||
T2 v2 = in2[iid];
|
||||
out1[i] = v1;
|
||||
out2[i] = v2;
|
||||
});
|
||||
template <typename T1, typename T2, int BlkDim = 256, int ItemsPerThread = 4>
|
||||
void gather(int device_idx, T1* out1, const T1* in1, T2* out2, const T2* in2,
|
||||
const int* instId, int nVals) {
|
||||
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
|
||||
[=] __device__(int i) {
|
||||
int iid = instId[i];
|
||||
T1 v1 = in1[iid];
|
||||
T2 v2 = in2[iid];
|
||||
out1[i] = v1;
|
||||
out2[i] = v2;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -273,13 +275,13 @@ void gather(int device_idx, T1* out1, const T1* in1, T2* out2, const T2* in2, co
|
||||
* @param instId gather indices
|
||||
* @param nVals length of the buffers
|
||||
*/
|
||||
template <typename T, int BlkDim=256, int ItemsPerThread=4>
|
||||
template <typename T, int BlkDim = 256, int ItemsPerThread = 4>
|
||||
void gather(int device_idx, T* out, const T* in, const int* instId, int nVals) {
|
||||
dh::launch_n<ItemsPerThread,BlkDim>
|
||||
(device_idx, nVals, [=] __device__(int i) {
|
||||
int iid = instId[i];
|
||||
out[i] = in[iid];
|
||||
});
|
||||
dh::launch_n<ItemsPerThread, BlkDim>(device_idx, nVals,
|
||||
[=] __device__(int i) {
|
||||
int iid = instId[i];
|
||||
out[i] = in[iid];
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace tree
|
||||
|
||||
@@ -9,11 +9,11 @@
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <ctime>
|
||||
#include <cub/cub.cuh>
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <numeric>
|
||||
#include <cub/cub.cuh>
|
||||
|
||||
#ifndef NCCL
|
||||
#define NCCL 1
|
||||
@@ -29,8 +29,8 @@
|
||||
|
||||
namespace dh {
|
||||
|
||||
#define HOST_DEV_INLINE __host__ __device__ __forceinline__
|
||||
#define DEV_INLINE __device__ __forceinline__
|
||||
#define HOST_DEV_INLINE __host__ __device__ __forceinline__
|
||||
#define DEV_INLINE __device__ __forceinline__
|
||||
|
||||
/*
|
||||
* Error handling functions
|
||||
@@ -126,6 +126,11 @@ inline std::string device_name(int device_idx) {
|
||||
return std::string(prop.name);
|
||||
}
|
||||
|
||||
// ensure gpu_id is correct, so not dependent upon user knowing details
|
||||
inline int get_device_idx(int gpu_id) {
|
||||
// protect against overrun for gpu_id
|
||||
return (std::abs(gpu_id) + 0) % dh::n_visible_devices();
|
||||
}
|
||||
|
||||
/*
|
||||
* Timers
|
||||
@@ -309,11 +314,13 @@ enum memory_type { DEVICE, DEVICE_MANAGED };
|
||||
|
||||
template <memory_type MemoryT>
|
||||
class bulk_allocator;
|
||||
template <typename T> class dvec2;
|
||||
template <typename T>
|
||||
class dvec2;
|
||||
|
||||
template <typename T>
|
||||
class dvec {
|
||||
friend class dvec2<T>;
|
||||
friend class dvec2<T>;
|
||||
|
||||
private:
|
||||
T *_ptr;
|
||||
size_t _size;
|
||||
@@ -327,9 +334,10 @@ class dvec {
|
||||
_ptr = static_cast<T *>(ptr);
|
||||
_size = size;
|
||||
_device_idx = device_idx;
|
||||
safe_cuda(cudaSetDevice(_device_idx));
|
||||
}
|
||||
|
||||
dvec() : _ptr(NULL), _size(0), _device_idx(0) {}
|
||||
dvec() : _ptr(NULL), _size(0), _device_idx(-1) {}
|
||||
size_t size() const { return _size; }
|
||||
int device_idx() const { return _device_idx; }
|
||||
bool empty() const { return _ptr == NULL || _size == 0; }
|
||||
@@ -378,6 +386,10 @@ class dvec {
|
||||
if (other.device_idx() == this->device_idx()) {
|
||||
thrust::copy(other.tbegin(), other.tend(), this->tbegin());
|
||||
} else {
|
||||
std::cout << "deviceother: " << other.device_idx()
|
||||
<< " devicethis: " << this->device_idx() << std::endl;
|
||||
std::cout << "size deviceother: " << other.size()
|
||||
<< " devicethis: " << this->device_idx() << std::endl;
|
||||
throw std::runtime_error("Cannot copy to/from different devices");
|
||||
}
|
||||
|
||||
@@ -401,26 +413,24 @@ class dvec {
|
||||
*/
|
||||
template <typename T>
|
||||
class dvec2 {
|
||||
|
||||
private:
|
||||
dvec<T> _d1, _d2;
|
||||
cub::DoubleBuffer<T> _buff;
|
||||
int _device_idx;
|
||||
|
||||
|
||||
public:
|
||||
void external_allocate(int device_idx, void *ptr1, void *ptr2, size_t size) {
|
||||
if (!empty()) {
|
||||
throw std::runtime_error("Tried to allocate dvec2 but already allocated");
|
||||
}
|
||||
_device_idx = device_idx;
|
||||
_d1.external_allocate(_device_idx, ptr1, size);
|
||||
_d2.external_allocate(_device_idx, ptr2, size);
|
||||
_buff.d_buffers[0] = static_cast<T *>(ptr1);
|
||||
_buff.d_buffers[1] = static_cast<T *>(ptr2);
|
||||
_buff.selector = 0;
|
||||
_device_idx = device_idx;
|
||||
}
|
||||
dvec2() : _d1(), _d2(), _buff(), _device_idx(0) {}
|
||||
dvec2() : _d1(), _d2(), _buff(), _device_idx(-1) {}
|
||||
|
||||
size_t size() const { return _d1.size(); }
|
||||
int device_idx() const { return _device_idx; }
|
||||
@@ -433,7 +443,7 @@ class dvec2 {
|
||||
|
||||
T *current() { return _buff.Current(); }
|
||||
|
||||
dvec<T> ¤t_dvec() { return _buff.selector == 0? d1() : d2(); }
|
||||
dvec<T> ¤t_dvec() { return _buff.selector == 0 ? d1() : d2(); }
|
||||
|
||||
T *other() { return _buff.Alternate(); }
|
||||
};
|
||||
@@ -459,7 +469,8 @@ class bulk_allocator {
|
||||
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
size_t get_size_bytes(dvec<T> *first_vec, SizeT first_size, Args... args) {
|
||||
return get_size_bytes<T,SizeT>(first_vec, first_size) + get_size_bytes(args...);
|
||||
return get_size_bytes<T, SizeT>(first_vec, first_size) +
|
||||
get_size_bytes(args...);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT>
|
||||
@@ -496,20 +507,23 @@ class bulk_allocator {
|
||||
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
size_t get_size_bytes(dvec2<T> *first_vec, SizeT first_size, Args... args) {
|
||||
return get_size_bytes<T,SizeT>(first_vec, first_size) + get_size_bytes(args...);
|
||||
return get_size_bytes<T, SizeT>(first_vec, first_size) +
|
||||
get_size_bytes(args...);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT>
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec, SizeT first_size) {
|
||||
first_vec->external_allocate(device_idx, static_cast<void *>(ptr),
|
||||
static_cast<void *>(ptr+align_round_up(first_size * sizeof(T))),
|
||||
first_size);
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
|
||||
SizeT first_size) {
|
||||
first_vec->external_allocate(
|
||||
device_idx, static_cast<void *>(ptr),
|
||||
static_cast<void *>(ptr + align_round_up(first_size * sizeof(T))),
|
||||
first_size);
|
||||
}
|
||||
|
||||
template <typename T, typename SizeT, typename... Args>
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec, SizeT first_size,
|
||||
Args... args) {
|
||||
allocate_dvec<T,SizeT>(device_idx, ptr, first_vec, first_size);
|
||||
void allocate_dvec(int device_idx, char *ptr, dvec2<T> *first_vec,
|
||||
SizeT first_size, Args... args) {
|
||||
allocate_dvec<T, SizeT>(device_idx, ptr, first_vec, first_size);
|
||||
ptr += (align_round_up(first_size * sizeof(T)) * 2);
|
||||
allocate_dvec(device_idx, ptr, args...);
|
||||
}
|
||||
@@ -706,11 +720,11 @@ struct BernoulliRng {
|
||||
* @param name name used to track later
|
||||
* @param stream cuda stream where to measure time
|
||||
*/
|
||||
#define TIMEIT(call, name) \
|
||||
do { \
|
||||
dh::Timer t1234; \
|
||||
call; \
|
||||
t1234.printElapsed(name); \
|
||||
} while(0)
|
||||
#define TIMEIT(call, name) \
|
||||
do { \
|
||||
dh::Timer t1234; \
|
||||
call; \
|
||||
t1234.printElapsed(name); \
|
||||
} while (0)
|
||||
|
||||
} // namespace dh
|
||||
|
||||
@@ -17,8 +17,8 @@
|
||||
|
||||
#include "../../../../src/tree/param.h"
|
||||
#include "../common.cuh"
|
||||
#include "node.cuh"
|
||||
#include "loss_functions.cuh"
|
||||
#include "node.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@@ -45,7 +45,7 @@ HOST_DEV_INLINE Split maxSplit(Split a, Split b) {
|
||||
out.index = b.index;
|
||||
} else if (a.score == b.score) {
|
||||
out.score = a.score;
|
||||
out.index = (a.index < b.index)? a.index : b.index;
|
||||
out.index = (a.index < b.index) ? a.index : b.index;
|
||||
} else {
|
||||
out.score = a.score;
|
||||
out.index = a.index;
|
||||
@@ -54,7 +54,7 @@ HOST_DEV_INLINE Split maxSplit(Split a, Split b) {
|
||||
}
|
||||
|
||||
DEV_INLINE void atomicArgMax(Split* address, Split val) {
|
||||
unsigned long long* intAddress = (unsigned long long*) address;
|
||||
unsigned long long* intAddress = (unsigned long long*)address;
|
||||
unsigned long long old = *intAddress;
|
||||
unsigned long long assumed;
|
||||
do {
|
||||
@@ -65,23 +65,19 @@ DEV_INLINE void atomicArgMax(Split* address, Split val) {
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
DEV_INLINE void argMaxWithAtomics(int id, Split* nodeSplits,
|
||||
const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums, const float* vals,
|
||||
const int* colIds,
|
||||
const node_id_t* nodeAssigns,
|
||||
const Node<node_id_t>* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len,
|
||||
const TrainParam ¶m) {
|
||||
DEV_INLINE void argMaxWithAtomics(
|
||||
int id, Split* nodeSplits, const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums, const float* vals, const int* colIds,
|
||||
const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len, const TrainParam& param) {
|
||||
int nodeId = nodeAssigns[id];
|
||||
///@todo: this is really a bad check! but will be fixed when we move
|
||||
/// to key-based reduction
|
||||
if ((id == 0) || !((nodeId == nodeAssigns[id-1]) &&
|
||||
(colIds[id] == colIds[id-1]) &&
|
||||
(vals[id] == vals[id-1]))) {
|
||||
if ((id == 0) ||
|
||||
!((nodeId == nodeAssigns[id - 1]) && (colIds[id] == colIds[id - 1]) &&
|
||||
(vals[id] == vals[id - 1]))) {
|
||||
if (nodeId != UNUSED_NODE) {
|
||||
int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart,
|
||||
nUniqKeys);
|
||||
int sumId = abs2uniqKey(id, nodeAssigns, colIds, nodeStart, nUniqKeys);
|
||||
gpu_gpair colSum = gradSums[sumId];
|
||||
int uid = nodeId - nodeStart;
|
||||
Node<node_id_t> n = nodes[nodeId];
|
||||
@@ -90,23 +86,20 @@ DEV_INLINE void argMaxWithAtomics(int id, Split* nodeSplits,
|
||||
bool tmp;
|
||||
Split s;
|
||||
gpu_gpair missing = parentSum - colSum;
|
||||
s.score = loss_chg_missing(gradScans[id], missing, parentSum,
|
||||
parentGain, param, tmp);
|
||||
s.score = loss_chg_missing(gradScans[id], missing, parentSum, parentGain,
|
||||
param, tmp);
|
||||
s.index = id;
|
||||
atomicArgMax(nodeSplits+uid, s);
|
||||
} // end if nodeId != UNUSED_NODE
|
||||
} // end if id == 0 ...
|
||||
atomicArgMax(nodeSplits + uid, s);
|
||||
} // end if nodeId != UNUSED_NODE
|
||||
} // end if id == 0 ...
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
__global__ void atomicArgMaxByKeyGmem(Split* nodeSplits,
|
||||
const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums,
|
||||
const float* vals, const int* colIds,
|
||||
const node_id_t* nodeAssigns,
|
||||
const Node<node_id_t>* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len,
|
||||
const TrainParam param) {
|
||||
__global__ void atomicArgMaxByKeyGmem(
|
||||
Split* nodeSplits, const gpu_gpair* gradScans, const gpu_gpair* gradSums,
|
||||
const float* vals, const int* colIds, const node_id_t* nodeAssigns,
|
||||
const Node<node_id_t>* nodes, int nUniqKeys, node_id_t nodeStart, int len,
|
||||
const TrainParam param) {
|
||||
int id = threadIdx.x + (blockIdx.x * blockDim.x);
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
for (; id < len; id += stride) {
|
||||
@@ -116,19 +109,16 @@ __global__ void atomicArgMaxByKeyGmem(Split* nodeSplits,
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
__global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
|
||||
const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums,
|
||||
const float* vals, const int* colIds,
|
||||
const node_id_t* nodeAssigns,
|
||||
const Node<node_id_t>* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len,
|
||||
const TrainParam param) {
|
||||
__global__ void atomicArgMaxByKeySmem(
|
||||
Split* nodeSplits, const gpu_gpair* gradScans, const gpu_gpair* gradSums,
|
||||
const float* vals, const int* colIds, const node_id_t* nodeAssigns,
|
||||
const Node<node_id_t>* nodes, int nUniqKeys, node_id_t nodeStart, int len,
|
||||
const TrainParam param) {
|
||||
extern __shared__ char sArr[];
|
||||
Split* sNodeSplits = (Split*)sArr;
|
||||
int tid = threadIdx.x;
|
||||
Split defVal;
|
||||
#pragma unroll 1
|
||||
#pragma unroll 1
|
||||
for (int i = tid; i < nUniqKeys; i += blockDim.x) {
|
||||
sNodeSplits[i] = defVal;
|
||||
}
|
||||
@@ -142,7 +132,7 @@ __global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
|
||||
__syncthreads();
|
||||
for (int i = tid; i < nUniqKeys; i += blockDim.x) {
|
||||
Split s = sNodeSplits[i];
|
||||
atomicArgMax(nodeSplits+i, s);
|
||||
atomicArgMax(nodeSplits + i, s);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -162,28 +152,30 @@ __global__ void atomicArgMaxByKeySmem(Split* nodeSplits,
|
||||
* @param param training parameters
|
||||
* @param algo which algorithm to use for argmax_by_key
|
||||
*/
|
||||
template <typename node_id_t, int BLKDIM=256, int ITEMS_PER_THREAD=4>
|
||||
template <typename node_id_t, int BLKDIM = 256, int ITEMS_PER_THREAD = 4>
|
||||
void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums, const float* vals, const int* colIds,
|
||||
const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
|
||||
const gpu_gpair* gradSums, const float* vals,
|
||||
const int* colIds, const node_id_t* nodeAssigns,
|
||||
const Node<node_id_t>* nodes, int nUniqKeys,
|
||||
node_id_t nodeStart, int len, const TrainParam param,
|
||||
ArgMaxByKeyAlgo algo) {
|
||||
fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(param.gpu_id, nodeSplits, nUniqKeys, Split());
|
||||
int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM);
|
||||
switch(algo) {
|
||||
case ABK_GMEM:
|
||||
atomicArgMaxByKeyGmem<node_id_t><<<nBlks,BLKDIM>>>
|
||||
(nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
|
||||
nUniqKeys, nodeStart, len, param);
|
||||
break;
|
||||
case ABK_SMEM:
|
||||
atomicArgMaxByKeySmem<node_id_t>
|
||||
<<<nBlks,BLKDIM,sizeof(Split)*nUniqKeys>>>
|
||||
(nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
|
||||
nUniqKeys, nodeStart, len, param);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("argMaxByKey: Bad algo passed!");
|
||||
fillConst<Split, BLKDIM, ITEMS_PER_THREAD>(dh::get_device_idx(param.gpu_id),
|
||||
nodeSplits, nUniqKeys, Split());
|
||||
int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD * BLKDIM);
|
||||
switch (algo) {
|
||||
case ABK_GMEM:
|
||||
atomicArgMaxByKeyGmem<node_id_t><<<nBlks, BLKDIM>>>(
|
||||
nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
|
||||
nUniqKeys, nodeStart, len, param);
|
||||
break;
|
||||
case ABK_SMEM:
|
||||
atomicArgMaxByKeySmem<
|
||||
node_id_t><<<nBlks, BLKDIM, sizeof(Split) * nUniqKeys>>>(
|
||||
nodeSplits, gradScans, gradSums, vals, colIds, nodeAssigns, nodes,
|
||||
nUniqKeys, nodeStart, len, param);
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error("argMaxByKey: Bad algo passed!");
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@
|
||||
#include "../common.cuh"
|
||||
#include "gradients.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
@@ -41,7 +40,7 @@ static const int NONE_KEY = -100;
|
||||
* @param tmpKeys keys buffer
|
||||
* @param size number of elements that will be scanned
|
||||
*/
|
||||
template <int BLKDIM_L1L3=256>
|
||||
template <int BLKDIM_L1L3 = 256>
|
||||
int scanTempBufferSize(int size) {
|
||||
int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
|
||||
return nBlks;
|
||||
@@ -49,7 +48,7 @@ int scanTempBufferSize(int size) {
|
||||
|
||||
struct AddByKey {
|
||||
template <typename T>
|
||||
HOST_DEV_INLINE T operator()(const T &first, const T &second) const {
|
||||
HOST_DEV_INLINE T operator()(const T& first, const T& second) const {
|
||||
T result;
|
||||
if (first.key == second.key) {
|
||||
result.key = first.key;
|
||||
@@ -74,7 +73,7 @@ __global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
|
||||
typedef cub::BlockScan<Pair, BLKDIM_L1L3> BlockScan;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
Pair threadData;
|
||||
int tid = blockIdx.x*BLKDIM_L1L3 + threadIdx.x;
|
||||
int tid = blockIdx.x * BLKDIM_L1L3 + threadIdx.x;
|
||||
if (tid < size) {
|
||||
myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
|
||||
myValue = get(tid, vals, instIds);
|
||||
@@ -82,7 +81,7 @@ __global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
|
||||
myKey = NONE_KEY;
|
||||
myValue = 0.f;
|
||||
}
|
||||
threadData.key = myKey;
|
||||
threadData.key = myKey;
|
||||
threadData.value = myValue;
|
||||
// get previous key, especially needed for the last thread in this block
|
||||
// in order to pass on the partial scan values.
|
||||
@@ -90,18 +89,17 @@ __global__ void cubScanByKeyL1(gpu_gpair* scans, const gpu_gpair* vals,
|
||||
// else, the result of this shuffle operation will be undefined
|
||||
int previousKey = __shfl_up(myKey, 1);
|
||||
// Collectively compute the block-wide exclusive prefix sum
|
||||
BlockScan(temp_storage).ExclusiveScan(threadData, threadData, rootPair,
|
||||
AddByKey());
|
||||
BlockScan(temp_storage)
|
||||
.ExclusiveScan(threadData, threadData, rootPair, AddByKey());
|
||||
if (tid < size) {
|
||||
scans[tid] = threadData.value;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
if (threadIdx.x == BLKDIM_L1L3 - 1) {
|
||||
threadData.value = (myKey == previousKey)?
|
||||
threadData.value :
|
||||
gpu_gpair(0.0f, 0.0f);
|
||||
mKeys[blockIdx.x] = myKey;
|
||||
threadData.value =
|
||||
(myKey == previousKey) ? threadData.value : gpu_gpair(0.0f, 0.0f);
|
||||
mKeys[blockIdx.x] = myKey;
|
||||
mScans[blockIdx.x] = threadData.value + myValue;
|
||||
}
|
||||
}
|
||||
@@ -111,11 +109,10 @@ __global__ void cubScanByKeyL2(gpu_gpair* mScans, int* mKeys, int mLength) {
|
||||
typedef cub::BlockScan<Pair, BLKSIZE, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
|
||||
Pair threadData;
|
||||
__shared__ typename BlockScan::TempStorage temp_storage;
|
||||
for (int i = threadIdx.x; i < mLength; i += BLKSIZE-1) {
|
||||
threadData.key = mKeys[i];
|
||||
for (int i = threadIdx.x; i < mLength; i += BLKSIZE - 1) {
|
||||
threadData.key = mKeys[i];
|
||||
threadData.value = mScans[i];
|
||||
BlockScan(temp_storage).InclusiveScan(threadData, threadData,
|
||||
AddByKey());
|
||||
BlockScan(temp_storage).InclusiveScan(threadData, threadData, AddByKey());
|
||||
mScans[i] = threadData.value;
|
||||
__syncthreads();
|
||||
}
|
||||
@@ -136,15 +133,14 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
|
||||
__shared__ char gradBuff[sizeof(gpu_gpair)];
|
||||
__shared__ int s_mKeys;
|
||||
gpu_gpair* s_mScans = (gpu_gpair*)gradBuff;
|
||||
if(tid >= size)
|
||||
return;
|
||||
if (tid >= size) return;
|
||||
// cache block-wide partial scan info
|
||||
if (relId == 0) {
|
||||
s_mKeys = (blockIdx.x > 0)? mKeys[blockIdx.x-1] : NONE_KEY;
|
||||
s_mScans[0] = (blockIdx.x > 0)? mScans[blockIdx.x-1] : gpu_gpair();
|
||||
s_mKeys = (blockIdx.x > 0) ? mKeys[blockIdx.x - 1] : NONE_KEY;
|
||||
s_mScans[0] = (blockIdx.x > 0) ? mScans[blockIdx.x - 1] : gpu_gpair();
|
||||
}
|
||||
int myKey = abs2uniqKey(tid, keys, colIds, nodeStart, nUniqKeys);
|
||||
int previousKey = tid == 0 ? NONE_KEY : abs2uniqKey(tid-1, keys, colIds,
|
||||
int previousKey = tid == 0 ? NONE_KEY : abs2uniqKey(tid - 1, keys, colIds,
|
||||
nodeStart, nUniqKeys);
|
||||
gpu_gpair myValue = scans[tid];
|
||||
__syncthreads();
|
||||
@@ -162,9 +158,11 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs fused reduce and scan by key functionality. It is assumed that
|
||||
* @brief Performs fused reduce and scan by key functionality. It is assumed
|
||||
* that
|
||||
* the keys occur contiguously!
|
||||
* @param sums the output gradient reductions for each element performed key-wise
|
||||
* @param sums the output gradient reductions for each element performed
|
||||
* key-wise
|
||||
* @param scans the output gradient scans for each element performed key-wise
|
||||
* @param vals the gradients evaluated for each observation.
|
||||
* @param instIds instance ids for each element
|
||||
@@ -179,20 +177,20 @@ __global__ void cubScanByKeyL3(gpu_gpair* sums, gpu_gpair* scans,
|
||||
* @param colIds column indices for each element in the array
|
||||
* @param nodeStart index of the leftmost node in the current level
|
||||
*/
|
||||
template <typename node_id_t, int BLKDIM_L1L3=256, int BLKDIM_L2=512>
|
||||
template <typename node_id_t, int BLKDIM_L1L3 = 256, int BLKDIM_L2 = 512>
|
||||
void reduceScanByKey(gpu_gpair* sums, gpu_gpair* scans, const gpu_gpair* vals,
|
||||
const int* instIds, const node_id_t* keys, int size,
|
||||
int nUniqKeys, int nCols, gpu_gpair* tmpScans,
|
||||
int* tmpKeys, const int* colIds, node_id_t nodeStart) {
|
||||
int nBlks = dh::div_round_up(size, BLKDIM_L1L3);
|
||||
cudaMemset(sums, 0, nUniqKeys*nCols*sizeof(gpu_gpair));
|
||||
cubScanByKeyL1<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
|
||||
(scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
|
||||
nodeStart, size);
|
||||
cudaMemset(sums, 0, nUniqKeys * nCols * sizeof(gpu_gpair));
|
||||
cubScanByKeyL1<node_id_t, BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>(
|
||||
scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
|
||||
nodeStart, size);
|
||||
cubScanByKeyL2<BLKDIM_L2><<<1, BLKDIM_L2>>>(tmpScans, tmpKeys, nBlks);
|
||||
cubScanByKeyL3<node_id_t,BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>
|
||||
(sums, scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
|
||||
nodeStart, size);
|
||||
cubScanByKeyL3<node_id_t, BLKDIM_L1L3><<<nBlks, BLKDIM_L1L3>>>(
|
||||
sums, scans, vals, instIds, tmpScans, tmpKeys, keys, nUniqKeys, colIds,
|
||||
nodeStart, size);
|
||||
}
|
||||
|
||||
} // namespace exact
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@@ -15,18 +16,17 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "../../../../src/tree/param.h"
|
||||
#include "xgboost/tree_updater.h"
|
||||
#include "cub/cub.cuh"
|
||||
#include "../common.cuh"
|
||||
#include <vector>
|
||||
#include "loss_functions.cuh"
|
||||
#include "gradients.cuh"
|
||||
#include "node.cuh"
|
||||
#include "../../../../src/tree/param.h"
|
||||
#include "../common.cuh"
|
||||
#include "argmax_by_key.cuh"
|
||||
#include "split2node.cuh"
|
||||
#include "cub/cub.cuh"
|
||||
#include "fused_scan_reduce_by_key.cuh"
|
||||
|
||||
#include "gradients.cuh"
|
||||
#include "loss_functions.cuh"
|
||||
#include "node.cuh"
|
||||
#include "split2node.cuh"
|
||||
#include "xgboost/tree_updater.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@@ -48,8 +48,8 @@ template <typename node_id_t>
|
||||
__global__ void assignColIds(int* colIds, const int* colOffsets) {
|
||||
int myId = blockIdx.x;
|
||||
int start = colOffsets[myId];
|
||||
int end = colOffsets[myId+1];
|
||||
for (int id = start+threadIdx.x; id < end; id += blockDim.x) {
|
||||
int end = colOffsets[myId + 1];
|
||||
for (int id = start + threadIdx.x; id < end; id += blockDim.x) {
|
||||
colIds[id] = myId;
|
||||
}
|
||||
}
|
||||
@@ -70,7 +70,7 @@ __global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
|
||||
node_id_t result;
|
||||
if (n.isLeaf() || n.isUnused()) {
|
||||
result = UNUSED_NODE;
|
||||
} else if(n.isDefaultLeft()) {
|
||||
} else if (n.isDefaultLeft()) {
|
||||
result = (2 * n.id) + 1;
|
||||
} else {
|
||||
result = (2 * n.id) + 2;
|
||||
@@ -81,8 +81,9 @@ __global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
|
||||
template <typename node_id_t>
|
||||
__global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
|
||||
const node_id_t* nodeIds, const int* instId,
|
||||
const Node<node_id_t>* nodes, const int* colOffsets,
|
||||
const float* vals, int nVals, int nCols) {
|
||||
const Node<node_id_t>* nodes,
|
||||
const int* colOffsets, const float* vals,
|
||||
int nVals, int nCols) {
|
||||
int id = threadIdx.x + (blockIdx.x * blockDim.x);
|
||||
const int stride = blockDim.x * gridDim.x;
|
||||
for (; id < nVals; id += stride) {
|
||||
@@ -95,7 +96,7 @@ __global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
|
||||
if (nId != UNUSED_NODE) {
|
||||
const Node<node_id_t> n = nodes[nId];
|
||||
int colId = n.colIdx;
|
||||
//printf("nid=%d colId=%d id=%d\n", nId, colId, id);
|
||||
// printf("nid=%d colId=%d id=%d\n", nId, colId, id);
|
||||
int start = colOffsets[colId];
|
||||
int end = colOffsets[colId + 1];
|
||||
///@todo: too much wasteful threads!!
|
||||
@@ -114,20 +115,26 @@ __global__ void markLeavesKernel(Node<node_id_t>* nodes, int len) {
|
||||
int lid = (id << 1) + 1;
|
||||
int rid = (id << 1) + 2;
|
||||
if ((lid >= len) || (rid >= len)) {
|
||||
nodes[id].score = -FLT_MAX; // bottom-most nodes
|
||||
nodes[id].score = -FLT_MAX; // bottom-most nodes
|
||||
} else if (nodes[lid].isUnused() && nodes[rid].isUnused()) {
|
||||
nodes[id].score = -FLT_MAX; // unused child nodes
|
||||
nodes[id].score = -FLT_MAX; // unused child nodes
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// unit test forward declaration for friend function access
|
||||
template <typename node_id_t> void testSmallData();
|
||||
template <typename node_id_t> void testLargeData();
|
||||
template <typename node_id_t> void testAllocate();
|
||||
template <typename node_id_t> void testMarkLeaves();
|
||||
template <typename node_id_t> void testDense2Sparse();
|
||||
template <typename node_id_t> class GPUBuilder;
|
||||
template <typename node_id_t>
|
||||
void testSmallData();
|
||||
template <typename node_id_t>
|
||||
void testLargeData();
|
||||
template <typename node_id_t>
|
||||
void testAllocate();
|
||||
template <typename node_id_t>
|
||||
void testMarkLeaves();
|
||||
template <typename node_id_t>
|
||||
void testDense2Sparse();
|
||||
template <typename node_id_t>
|
||||
class GPUBuilder;
|
||||
template <typename node_id_t>
|
||||
std::shared_ptr<xgboost::DMatrix> setupGPUBuilder(
|
||||
const std::string& file,
|
||||
@@ -136,7 +143,7 @@ std::shared_ptr<xgboost::DMatrix> setupGPUBuilder(
|
||||
template <typename node_id_t>
|
||||
class GPUBuilder {
|
||||
public:
|
||||
GPUBuilder(): allocated(false) {}
|
||||
GPUBuilder() : allocated(false) {}
|
||||
|
||||
~GPUBuilder() {}
|
||||
|
||||
@@ -146,10 +153,10 @@ class GPUBuilder {
|
||||
maxLeaves = 1 << param.max_depth;
|
||||
}
|
||||
|
||||
void UpdateParam(const TrainParam ¶m) { this->param = param; }
|
||||
void UpdateParam(const TrainParam& param) { this->param = param; }
|
||||
|
||||
/// @note: Update should be only after Init!!
|
||||
void Update(const std::vector<bst_gpair>& gpair, DMatrix *hMat,
|
||||
void Update(const std::vector<bst_gpair>& gpair, DMatrix* hMat,
|
||||
RegTree* hTree) {
|
||||
if (!allocated) {
|
||||
setupOneTimeData(*hMat);
|
||||
@@ -171,7 +178,7 @@ class GPUBuilder {
|
||||
dense2sparse(*hTree);
|
||||
}
|
||||
|
||||
private:
|
||||
private:
|
||||
friend void testSmallData<node_id_t>();
|
||||
friend void testLargeData<node_id_t>();
|
||||
friend void testAllocate<node_id_t>();
|
||||
@@ -194,7 +201,7 @@ private:
|
||||
dh::dvec<gpu_gpair> gradsInst;
|
||||
dh::dvec2<node_id_t> nodeAssigns;
|
||||
dh::dvec2<int> nodeLocations;
|
||||
dh::dvec<Node<node_id_t> > nodes;
|
||||
dh::dvec<Node<node_id_t>> nodes;
|
||||
dh::dvec<node_id_t> nodeAssignsPerInst;
|
||||
dh::dvec<gpu_gpair> gradSums;
|
||||
dh::dvec<gpu_gpair> gradScans;
|
||||
@@ -218,35 +225,26 @@ private:
|
||||
argMaxByKey(nodeSplits.data(), gradScans.data(), gradSums.data(),
|
||||
vals.current(), colIds.data(), nodeAssigns.current(),
|
||||
nodes.data(), nNodes, nodeStart, nVals, param,
|
||||
level<=MAX_ABK_LEVELS? ABK_SMEM : ABK_GMEM);
|
||||
level <= MAX_ABK_LEVELS ? ABK_SMEM : ABK_GMEM);
|
||||
split2node(nodes.data(), nodeSplits.data(), gradScans.data(),
|
||||
gradSums.data(), vals.current(), colIds.data(), colOffsets.data(),
|
||||
nodeAssigns.current(), nNodes, nodeStart, nCols, param);
|
||||
gradSums.data(), vals.current(), colIds.data(),
|
||||
colOffsets.data(), nodeAssigns.current(), nNodes, nodeStart,
|
||||
nCols, param);
|
||||
}
|
||||
|
||||
void allocateAllData(int offsetSize) {
|
||||
int tmpBuffSize = scanTempBufferSize(nVals);
|
||||
ba.allocate(param.gpu_id,
|
||||
&vals, nVals,
|
||||
&vals_cached, nVals,
|
||||
&instIds, nVals,
|
||||
&instIds_cached, nVals,
|
||||
&colOffsets, offsetSize,
|
||||
&gradsInst, nRows,
|
||||
&nodeAssigns, nVals,
|
||||
&nodeLocations, nVals,
|
||||
&nodes, maxNodes,
|
||||
&nodeAssignsPerInst, nRows,
|
||||
&gradSums, maxLeaves*nCols,
|
||||
&gradScans, nVals,
|
||||
&nodeSplits, maxLeaves,
|
||||
&tmpScanGradBuff, tmpBuffSize,
|
||||
&tmpScanKeyBuff, tmpBuffSize,
|
||||
&colIds, nVals);
|
||||
ba.allocate(dh::get_device_idx(param.gpu_id), &vals, nVals, &vals_cached,
|
||||
nVals, &instIds, nVals, &instIds_cached, nVals, &colOffsets,
|
||||
offsetSize, &gradsInst, nRows, &nodeAssigns, nVals,
|
||||
&nodeLocations, nVals, &nodes, maxNodes, &nodeAssignsPerInst,
|
||||
nRows, &gradSums, maxLeaves * nCols, &gradScans, nVals,
|
||||
&nodeSplits, maxLeaves, &tmpScanGradBuff, tmpBuffSize,
|
||||
&tmpScanKeyBuff, tmpBuffSize, &colIds, nVals);
|
||||
}
|
||||
|
||||
void setupOneTimeData(DMatrix& hMat) {
|
||||
size_t free_memory = dh::available_memory(param.gpu_id);
|
||||
size_t free_memory = dh::available_memory(dh::get_device_idx(param.gpu_id));
|
||||
if (!hMat.SingleColBlock()) {
|
||||
throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
|
||||
}
|
||||
@@ -259,7 +257,8 @@ private:
|
||||
if (!param.silent) {
|
||||
const int mb_size = 1048576;
|
||||
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
|
||||
<< free_memory / mb_size << " MB on " << dh::device_name(param.gpu_id);
|
||||
<< free_memory / mb_size << " MB on "
|
||||
<< dh::device_name(dh::get_device_idx(param.gpu_id));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -282,9 +281,10 @@ private:
|
||||
iter->BeforeFirst();
|
||||
while (iter->Next()) {
|
||||
const ColBatch& batch = iter->Value();
|
||||
for (int i=0;i<batch.size;i++) {
|
||||
for (int i = 0; i < batch.size; i++) {
|
||||
const ColBatch::Inst& col = batch[i];
|
||||
for (const ColBatch::Entry* it=col.data;it!=col.data+col.length;it++) {
|
||||
for (const ColBatch::Entry* it = col.data; it != col.data + col.length;
|
||||
it++) {
|
||||
int inst_id = static_cast<int>(it->index);
|
||||
fval.push_back(it->fvalue);
|
||||
fId.push_back(inst_id);
|
||||
@@ -301,16 +301,17 @@ private:
|
||||
vals.current_dvec() = fval;
|
||||
instIds.current_dvec() = fId;
|
||||
colOffsets = offset;
|
||||
segmentedSort<float,int>(tmp_mem, vals, instIds, nVals, nCols, colOffsets);
|
||||
segmentedSort<float, int>(tmp_mem, vals, instIds, nVals, nCols, colOffsets);
|
||||
vals_cached = vals.current_dvec();
|
||||
instIds_cached = instIds.current_dvec();
|
||||
assignColIds<node_id_t><<<nCols,512>>>(colIds.data(), colOffsets.data());
|
||||
assignColIds<node_id_t><<<nCols, 512>>>(colIds.data(), colOffsets.data());
|
||||
}
|
||||
|
||||
void transferGrads(const std::vector<bst_gpair>& gpair) {
|
||||
// HACK
|
||||
dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
|
||||
sizeof(gpu_gpair)*nRows, cudaMemcpyHostToDevice));
|
||||
sizeof(gpu_gpair) * nRows,
|
||||
cudaMemcpyHostToDevice));
|
||||
// evaluate the full-grad reduction for the root node
|
||||
sumReduction<gpu_gpair>(tmp_mem, gradsInst, gradSums, nRows);
|
||||
}
|
||||
@@ -324,25 +325,23 @@ private:
|
||||
// for root node, just update the gradient/score/weight/id info
|
||||
// before splitting it! Currently all data is on GPU, hence this
|
||||
// stupid little kernel
|
||||
initRootNode<<<1,1>>>(nodes.data(), gradSums.data(), param);
|
||||
initRootNode<<<1, 1>>>(nodes.data(), gradSums.data(), param);
|
||||
} else {
|
||||
const int BlkDim = 256;
|
||||
const int ItemsPerThread = 4;
|
||||
// assign default node ids first
|
||||
int nBlks = dh::div_round_up(nRows, BlkDim);
|
||||
fillDefaultNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
|
||||
nodes.data(), nRows);
|
||||
fillDefaultNodeIds<<<nBlks, BlkDim>>>(nodeAssignsPerInst.data(),
|
||||
nodes.data(), nRows);
|
||||
// evaluate the correct child indices of non-missing values next
|
||||
nBlks = dh::div_round_up(nVals, BlkDim*ItemsPerThread);
|
||||
assignNodeIds<<<nBlks,BlkDim>>>(nodeAssignsPerInst.data(),
|
||||
nodeLocations.current(),
|
||||
nodeAssigns.current(),
|
||||
instIds.current(), nodes.data(),
|
||||
colOffsets.data(), vals.current(),
|
||||
nVals, nCols);
|
||||
nBlks = dh::div_round_up(nVals, BlkDim * ItemsPerThread);
|
||||
assignNodeIds<<<nBlks, BlkDim>>>(
|
||||
nodeAssignsPerInst.data(), nodeLocations.current(),
|
||||
nodeAssigns.current(), instIds.current(), nodes.data(),
|
||||
colOffsets.data(), vals.current(), nVals, nCols);
|
||||
// gather the node assignments across all other columns too
|
||||
gather<node_id_t>(param.gpu_id, nodeAssigns.current(), nodeAssignsPerInst.data(),
|
||||
instIds.current(), nVals);
|
||||
gather<node_id_t>(dh::get_device_idx(param.gpu_id), nodeAssigns.current(),
|
||||
nodeAssignsPerInst.data(), instIds.current(), nVals);
|
||||
sortKeys(level);
|
||||
}
|
||||
}
|
||||
@@ -351,9 +350,10 @@ private:
|
||||
// segmented-sort the arrays based on node-id's
|
||||
// but we don't need more than level+1 bits for sorting!
|
||||
segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
|
||||
0, level+1);
|
||||
gather<float,int>(param.gpu_id, vals.other(), vals.current(), instIds.other(),
|
||||
instIds.current(), nodeLocations.current(), nVals);
|
||||
0, level + 1);
|
||||
gather<float, int>(dh::get_device_idx(param.gpu_id), vals.other(),
|
||||
vals.current(), instIds.other(), instIds.current(),
|
||||
nodeLocations.current(), nVals);
|
||||
vals.buff().selector ^= 1;
|
||||
instIds.buff().selector ^= 1;
|
||||
}
|
||||
@@ -361,11 +361,11 @@ private:
|
||||
void markLeaves() {
|
||||
const int BlkDim = 128;
|
||||
int nBlks = dh::div_round_up(maxNodes, BlkDim);
|
||||
markLeavesKernel<<<nBlks,BlkDim>>>(nodes.data(), maxNodes);
|
||||
markLeavesKernel<<<nBlks, BlkDim>>>(nodes.data(), maxNodes);
|
||||
}
|
||||
|
||||
void dense2sparse(RegTree &tree) {
|
||||
std::vector<Node<node_id_t> > hNodes = nodes.as_vector();
|
||||
void dense2sparse(RegTree& tree) {
|
||||
std::vector<Node<node_id_t>> hNodes = nodes.as_vector();
|
||||
int nodeId = 0;
|
||||
for (int i = 0; i < maxNodes; ++i) {
|
||||
const Node<node_id_t>& n = hNodes[i];
|
||||
@@ -375,7 +375,7 @@ private:
|
||||
++nodeId;
|
||||
} else if (!hNodes[i].isUnused()) {
|
||||
tree.AddChilds(nodeId);
|
||||
tree[nodeId].set_split(n.colIdx, n.threshold, n.dir==LeftDir);
|
||||
tree[nodeId].set_split(n.colIdx, n.threshold, n.dir == LeftDir);
|
||||
tree.stat(nodeId).loss_chg = n.score;
|
||||
tree.stat(nodeId).sum_hess = n.gradSum.h;
|
||||
tree.stat(nodeId).base_weight = n.weight;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@@ -17,7 +18,6 @@
|
||||
|
||||
#include "../common.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
@@ -32,9 +32,9 @@ struct gpu_gpair {
|
||||
/** the 'h_i' as it appears in the xgboost paper */
|
||||
float h;
|
||||
|
||||
HOST_DEV_INLINE gpu_gpair(): g(0.f), h(0.f) {}
|
||||
HOST_DEV_INLINE gpu_gpair(const float& _g, const float& _h): g(_g), h(_h) {}
|
||||
HOST_DEV_INLINE gpu_gpair(const gpu_gpair& a): g(a.g), h(a.h) {}
|
||||
HOST_DEV_INLINE gpu_gpair() : g(0.f), h(0.f) {}
|
||||
HOST_DEV_INLINE gpu_gpair(const float& _g, const float& _h) : g(_g), h(_h) {}
|
||||
HOST_DEV_INLINE gpu_gpair(const gpu_gpair& a) : g(a.g), h(a.h) {}
|
||||
|
||||
/**
|
||||
* @brief Checks whether the hessian is more than the defined weight
|
||||
@@ -60,12 +60,12 @@ struct gpu_gpair {
|
||||
|
||||
HOST_DEV_INLINE friend gpu_gpair operator+(const gpu_gpair& a,
|
||||
const gpu_gpair& b) {
|
||||
return gpu_gpair(a.g+b.g, a.h+b.h);
|
||||
return gpu_gpair(a.g + b.g, a.h + b.h);
|
||||
}
|
||||
|
||||
HOST_DEV_INLINE friend gpu_gpair operator-(const gpu_gpair& a,
|
||||
const gpu_gpair& b) {
|
||||
return gpu_gpair(a.g-b.g, a.h-b.h);
|
||||
return gpu_gpair(a.g - b.g, a.h - b.h);
|
||||
}
|
||||
|
||||
HOST_DEV_INLINE gpu_gpair(int value) {
|
||||
@@ -73,7 +73,6 @@ struct gpu_gpair {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* @brief Gradient value getter function
|
||||
* @param id the index into the vals or instIds array to which to fetch
|
||||
@@ -81,7 +80,8 @@ struct gpu_gpair {
|
||||
* @param instIds instance index buffer
|
||||
* @return the expected gradient value
|
||||
*/
|
||||
HOST_DEV_INLINE gpu_gpair get(int id, const gpu_gpair* vals, const int* instIds) {
|
||||
HOST_DEV_INLINE gpu_gpair get(int id, const gpu_gpair* vals,
|
||||
const int* instIds) {
|
||||
id = instIds[id];
|
||||
return vals[id];
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@@ -18,17 +19,13 @@
|
||||
#include "../common.cuh"
|
||||
#include "gradients.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
HOST_DEV_INLINE float device_calc_loss_chg(const TrainParam ¶m,
|
||||
const gpu_gpair &scan,
|
||||
const gpu_gpair &missing,
|
||||
const gpu_gpair &parent_sum,
|
||||
const float &parent_gain,
|
||||
bool missing_left) {
|
||||
HOST_DEV_INLINE float device_calc_loss_chg(
|
||||
const TrainParam ¶m, const gpu_gpair &scan, const gpu_gpair &missing,
|
||||
const gpu_gpair &parent_sum, const float &parent_gain, bool missing_left) {
|
||||
gpu_gpair left = scan;
|
||||
if (missing_left) {
|
||||
left += missing;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights reserved.
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@@ -15,9 +16,8 @@
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "gradients.cuh"
|
||||
#include "../common.cuh"
|
||||
|
||||
#include "gradients.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@@ -34,11 +34,9 @@ enum DefaultDirection {
|
||||
RightDir
|
||||
};
|
||||
|
||||
|
||||
/** used to assign default id to a Node */
|
||||
static const int UNUSED_NODE = -1;
|
||||
|
||||
|
||||
/**
|
||||
* @struct Split node.cuh
|
||||
* @brief Abstraction of a possible split in the decision tree
|
||||
@@ -49,7 +47,7 @@ struct Split {
|
||||
/** index where to split in the DMatrix */
|
||||
int index;
|
||||
|
||||
HOST_DEV_INLINE Split(): score(-FLT_MAX), index(INT_MAX) {}
|
||||
HOST_DEV_INLINE Split() : score(-FLT_MAX), index(INT_MAX) {}
|
||||
|
||||
/**
|
||||
* @brief Whether the split info is valid to be used to create a new child
|
||||
@@ -61,7 +59,6 @@ struct Split {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* @struct Node node.cuh
|
||||
* @brief Abstraction of a node in the decision tree
|
||||
@@ -84,9 +81,14 @@ class Node {
|
||||
/** node id (used as key for reduce/scan) */
|
||||
node_id_t id;
|
||||
|
||||
HOST_DEV_INLINE Node(): gradSum(), score(-FLT_MAX), weight(-FLT_MAX),
|
||||
dir(LeftDir), threshold(0.f), colIdx(UNUSED_NODE),
|
||||
id(UNUSED_NODE) {}
|
||||
HOST_DEV_INLINE Node()
|
||||
: gradSum(),
|
||||
score(-FLT_MAX),
|
||||
weight(-FLT_MAX),
|
||||
dir(LeftDir),
|
||||
threshold(0.f),
|
||||
colIdx(UNUSED_NODE),
|
||||
id(UNUSED_NODE) {}
|
||||
|
||||
/** Tells whether this node is part of the decision tree */
|
||||
HOST_DEV_INLINE bool isUnused() const { return (id == UNUSED_NODE); }
|
||||
@@ -100,7 +102,6 @@ class Node {
|
||||
HOST_DEV_INLINE bool isDefaultLeft() const { return (dir == LeftDir); }
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* @struct Segment node.cuh
|
||||
* @brief Space inefficient, but super easy to implement structure to define
|
||||
@@ -112,7 +113,7 @@ struct Segment {
|
||||
/** end index of the segment */
|
||||
int end;
|
||||
|
||||
HOST_DEV_INLINE Segment(): start(-1), end(-1) {}
|
||||
HOST_DEV_INLINE Segment() : start(-1), end(-1) {}
|
||||
|
||||
/** Checks whether the current structure defines a valid segment */
|
||||
HOST_DEV_INLINE bool isValid() const {
|
||||
@@ -120,7 +121,6 @@ struct Segment {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* @enum NodeType node.cuh
|
||||
* @brief Useful to decribe the node type in a dense BFS-order tree array
|
||||
@@ -134,7 +134,6 @@ enum NodeType {
|
||||
UNUSED
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* @brief Absolute BFS order IDs to col-wise unique IDs based on user input
|
||||
* @param tid the index of the element that this thread should access
|
||||
|
||||
@@ -17,9 +17,8 @@
|
||||
|
||||
#include "../../../../src/tree/param.h"
|
||||
#include "gradients.cuh"
|
||||
#include "node.cuh"
|
||||
#include "loss_functions.cuh"
|
||||
|
||||
#include "node.cuh"
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
@@ -39,7 +38,7 @@ namespace exact {
|
||||
template <typename node_id_t>
|
||||
DEV_INLINE void updateOneChildNode(Node<node_id_t>* nodes, int nid,
|
||||
const gpu_gpair& grad,
|
||||
const TrainParam ¶m) {
|
||||
const TrainParam& param) {
|
||||
nodes[nid].gradSum = grad;
|
||||
nodes[nid].score = CalcGain(param, grad.g, grad.h);
|
||||
nodes[nid].weight = CalcWeight(param, grad.g, grad.h);
|
||||
@@ -58,18 +57,18 @@ DEV_INLINE void updateOneChildNode(Node<node_id_t>* nodes, int nid,
|
||||
template <typename node_id_t>
|
||||
DEV_INLINE void updateChildNodes(Node<node_id_t>* nodes, int pid,
|
||||
const gpu_gpair& gradL, const gpu_gpair& gradR,
|
||||
const TrainParam ¶m) {
|
||||
const TrainParam& param) {
|
||||
int childId = (pid * 2) + 1;
|
||||
updateOneChildNode(nodes, childId, gradL, param);
|
||||
updateOneChildNode(nodes, childId+1, gradR, param);
|
||||
updateOneChildNode(nodes, childId + 1, gradR, param);
|
||||
}
|
||||
|
||||
template <typename node_id_t>
|
||||
DEV_INLINE void updateNodeAndChildren(Node<node_id_t>* nodes, const Split& s,
|
||||
const Node<node_id_t>& n, int absNodeId, int colId,
|
||||
const gpu_gpair& gradScan,
|
||||
const Node<node_id_t>& n, int absNodeId,
|
||||
int colId, const gpu_gpair& gradScan,
|
||||
const gpu_gpair& colSum, float thresh,
|
||||
const TrainParam ¶m) {
|
||||
const TrainParam& param) {
|
||||
bool missingLeft = true;
|
||||
// get the default direction for the current node
|
||||
gpu_gpair missing = n.gradSum - colSum;
|
||||
@@ -84,19 +83,17 @@ DEV_INLINE void updateNodeAndChildren(Node<node_id_t>* nodes, const Split& s,
|
||||
rGradSum = n.gradSum - lGradSum;
|
||||
updateChildNodes(nodes, absNodeId, lGradSum, rGradSum, param);
|
||||
// update default-dir, threshold and feature id for current node
|
||||
nodes[absNodeId].dir = missingLeft? LeftDir : RightDir;
|
||||
nodes[absNodeId].dir = missingLeft ? LeftDir : RightDir;
|
||||
nodes[absNodeId].colIdx = colId;
|
||||
nodes[absNodeId].threshold = thresh;
|
||||
}
|
||||
|
||||
template <typename node_id_t, int BLKDIM=256>
|
||||
__global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits,
|
||||
const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums, const float* vals,
|
||||
const int* colIds, const int* colOffsets,
|
||||
const node_id_t* nodeAssigns, int nUniqKeys,
|
||||
node_id_t nodeStart, int nCols,
|
||||
const TrainParam param) {
|
||||
template <typename node_id_t, int BLKDIM = 256>
|
||||
__global__ void split2nodeKernel(
|
||||
Node<node_id_t>* nodes, const Split* nodeSplits, const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums, const float* vals, const int* colIds,
|
||||
const int* colOffsets, const node_id_t* nodeAssigns, int nUniqKeys,
|
||||
node_id_t nodeStart, int nCols, const TrainParam param) {
|
||||
int uid = (blockIdx.x * blockDim.x) + threadIdx.x;
|
||||
if (uid >= nUniqKeys) {
|
||||
return;
|
||||
@@ -105,11 +102,11 @@ __global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits
|
||||
Split s = nodeSplits[uid];
|
||||
if (s.isSplittable(param.min_split_loss)) {
|
||||
int idx = s.index;
|
||||
int nodeInstId = abs2uniqKey(idx, nodeAssigns, colIds, nodeStart,
|
||||
nUniqKeys);
|
||||
updateNodeAndChildren(nodes, s, nodes[absNodeId], absNodeId,
|
||||
colIds[idx], gradScans[idx],
|
||||
gradSums[nodeInstId], vals[idx], param);
|
||||
int nodeInstId =
|
||||
abs2uniqKey(idx, nodeAssigns, colIds, nodeStart, nUniqKeys);
|
||||
updateNodeAndChildren(nodes, s, nodes[absNodeId], absNodeId, colIds[idx],
|
||||
gradScans[idx], gradSums[nodeInstId], vals[idx],
|
||||
param);
|
||||
} else {
|
||||
// cannot be split further, so this node is a leaf!
|
||||
nodes[absNodeId].score = -FLT_MAX;
|
||||
@@ -129,20 +126,20 @@ __global__ void split2nodeKernel(Node<node_id_t>* nodes, const Split* nodeSplits
|
||||
* @param nUniqKeys number of nodes that we are currently working on
|
||||
* @param nodeStart start offset of the nodes in the overall BFS tree
|
||||
* @param nCols number of columns
|
||||
* @param preUniquifiedKeys whether to uniquify the keys from inside kernel or not
|
||||
* @param preUniquifiedKeys whether to uniquify the keys from inside kernel or
|
||||
* not
|
||||
* @param param the training parameter struct
|
||||
*/
|
||||
template <typename node_id_t, int BLKDIM=256>
|
||||
void split2node(Node<node_id_t>* nodes, const Split* nodeSplits, const gpu_gpair* gradScans,
|
||||
const gpu_gpair* gradSums, const float* vals, const int* colIds,
|
||||
const int* colOffsets, const node_id_t* nodeAssigns,
|
||||
int nUniqKeys, node_id_t nodeStart, int nCols,
|
||||
const TrainParam param) {
|
||||
template <typename node_id_t, int BLKDIM = 256>
|
||||
void split2node(Node<node_id_t>* nodes, const Split* nodeSplits,
|
||||
const gpu_gpair* gradScans, const gpu_gpair* gradSums,
|
||||
const float* vals, const int* colIds, const int* colOffsets,
|
||||
const node_id_t* nodeAssigns, int nUniqKeys,
|
||||
node_id_t nodeStart, int nCols, const TrainParam param) {
|
||||
int nBlks = dh::div_round_up(nUniqKeys, BLKDIM);
|
||||
split2nodeKernel<<<nBlks,BLKDIM>>>(nodes, nodeSplits, gradScans, gradSums,
|
||||
vals, colIds, colOffsets, nodeAssigns,
|
||||
nUniqKeys, nodeStart, nCols,
|
||||
param);
|
||||
split2nodeKernel<<<nBlks, BLKDIM>>>(nodes, nodeSplits, gradScans, gradSums,
|
||||
vals, colIds, colOffsets, nodeAssigns,
|
||||
nUniqKeys, nodeStart, nCols, param);
|
||||
}
|
||||
|
||||
} // namespace exact
|
||||
|
||||
@@ -73,11 +73,12 @@ struct GPUData {
|
||||
n_features, foffsets.data(), foffsets.data() + 1);
|
||||
|
||||
// Allocate memory
|
||||
size_t free_memory = dh::available_memory(param_in.gpu_id);
|
||||
ba.allocate(param_in.gpu_id,
|
||||
&fvalues, in_fvalues.size(), &fvalues_temp,
|
||||
in_fvalues.size(), &fvalues_cached, in_fvalues.size(), &foffsets,
|
||||
in_foffsets.size(), &instance_id, in_instance_id.size(),
|
||||
size_t free_memory =
|
||||
dh::available_memory(dh::get_device_idx(param_in.gpu_id));
|
||||
ba.allocate(
|
||||
dh::get_device_idx(param_in.gpu_id), &fvalues, in_fvalues.size(),
|
||||
&fvalues_temp, in_fvalues.size(), &fvalues_cached, in_fvalues.size(),
|
||||
&foffsets, in_foffsets.size(), &instance_id, in_instance_id.size(),
|
||||
&instance_id_temp, in_instance_id.size(), &instance_id_cached,
|
||||
in_instance_id.size(), &feature_id, in_feature_id.size(), &node_id,
|
||||
in_fvalues.size(), &node_id_temp, in_fvalues.size(), &node_id_instance,
|
||||
@@ -91,7 +92,7 @@ struct GPUData {
|
||||
const int mb_size = 1048576;
|
||||
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
|
||||
<< free_memory / mb_size << " MB on "
|
||||
<< dh::device_name(param_in.gpu_id);
|
||||
<< dh::device_name(dh::get_device_idx(param_in.gpu_id));
|
||||
}
|
||||
|
||||
fvalues_cached = in_fvalues;
|
||||
|
||||
@@ -125,7 +125,7 @@ void GPUHistBuilder::InitData(const std::vector<bst_gpair>& gpair,
|
||||
// set dList member
|
||||
dList.resize(n_devices);
|
||||
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
||||
int device_idx = (param.gpu_id + d_idx) % n_devices;
|
||||
int device_idx = (param.gpu_id + d_idx) % dh::n_visible_devices();
|
||||
dList[d_idx] = device_idx;
|
||||
}
|
||||
|
||||
@@ -141,7 +141,8 @@ void GPUHistBuilder::InitData(const std::vector<bst_gpair>& gpair,
|
||||
|
||||
// printf("# NCCL: Using devices\n");
|
||||
for (int d_idx = 0; d_idx < n_devices; ++d_idx) {
|
||||
streams[d_idx] = reinterpret_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t)));
|
||||
streams[d_idx] =
|
||||
reinterpret_cast<cudaStream_t*>(malloc(sizeof(cudaStream_t)));
|
||||
dh::safe_cuda(cudaSetDevice(dList[d_idx]));
|
||||
dh::safe_cuda(cudaStreamCreate(streams[d_idx]));
|
||||
|
||||
@@ -159,10 +160,11 @@ void GPUHistBuilder::InitData(const std::vector<bst_gpair>& gpair,
|
||||
// local find_split group of comms for each case of reduced number of GPUs
|
||||
// to use
|
||||
find_split_comms.resize(
|
||||
n_devices, std::vector<ncclComm_t>(n_devices)); // TODO(JCM): Excessive, but
|
||||
// ok, and best to do
|
||||
// here instead of
|
||||
// repeatedly
|
||||
n_devices,
|
||||
std::vector<ncclComm_t>(n_devices)); // TODO(JCM): Excessive, but
|
||||
// ok, and best to do
|
||||
// here instead of
|
||||
// repeatedly
|
||||
for (int num_d = 1; num_d <= n_devices;
|
||||
++num_d) { // loop over number of devices used
|
||||
dh::safe_nccl(ncclCommInitAll(find_split_comms[num_d - 1].data(), num_d,
|
||||
@@ -377,7 +379,8 @@ void GPUHistBuilder::BuildHist(int depth) {
|
||||
#if (NCCL)
|
||||
// (in-place) reduce each element of histogram (for only current level) across
|
||||
// multiple gpus
|
||||
// TODO(JCM): use out of place with pre-allocated buffer, but then have to copy
|
||||
// TODO(JCM): use out of place with pre-allocated buffer, but then have to
|
||||
// copy
|
||||
// back on device
|
||||
// fprintf(stderr,"sizeof(gpu_gpair)/sizeof(float)=%d\n",sizeof(gpu_gpair)/sizeof(float));
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
@@ -621,110 +624,128 @@ void GPUHistBuilder::LaunchFindSplit(int depth) {
|
||||
bool colsample =
|
||||
param.colsample_bylevel < 1.0 || param.colsample_bytree < 1.0;
|
||||
|
||||
// use power of 2 for split finder because nodes are power of 2 (broadcast
|
||||
// result to remaining devices)
|
||||
int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
|
||||
find_split_n_devices = std::min(n_nodes_level(depth), find_split_n_devices);
|
||||
int num_nodes_device = n_nodes_level(depth) / find_split_n_devices;
|
||||
int num_nodes_child_device = n_nodes_level(depth + 1) / find_split_n_devices;
|
||||
const int GRID_SIZE = num_nodes_device;
|
||||
int dosimuljob = 1;
|
||||
|
||||
#if (NCCL)
|
||||
// NOTE: No need to scatter before gather as all devices have same copy of
|
||||
// nodes, and within find_split_kernel() nodes_temp is given values from nodes
|
||||
int simuljob = 1; // whether to do job on single GPU and broadcast (0) or to
|
||||
// do same job on each GPU (1) (could make user parameter,
|
||||
// but too fine-grained maybe)
|
||||
int findsplit_shardongpus = 0; // too expensive generally, disable for now
|
||||
|
||||
// for all nodes (split among devices) find best split per node
|
||||
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
if (NCCL && findsplit_shardongpus) {
|
||||
dosimuljob = 0;
|
||||
// use power of 2 for split finder because nodes are power of 2 (broadcast
|
||||
// result to remaining devices)
|
||||
int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
|
||||
find_split_n_devices = std::min(n_nodes_level(depth), find_split_n_devices);
|
||||
int num_nodes_device = n_nodes_level(depth) / find_split_n_devices;
|
||||
int num_nodes_child_device =
|
||||
n_nodes_level(depth + 1) / find_split_n_devices;
|
||||
const int GRID_SIZE = num_nodes_device;
|
||||
|
||||
int nodes_offset_device = d_idx * num_nodes_device;
|
||||
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
|
||||
(const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)),
|
||||
feature_segments[d_idx].data(), depth, (info->num_col),
|
||||
(hmat_.row_ptr.back()), nodes[d_idx].data(),
|
||||
nodes_temp[d_idx].data(), nodes_child_temp[d_idx].data(),
|
||||
nodes_offset_device, fidx_min_map[d_idx].data(),
|
||||
gidx_fvalue_map[d_idx].data(), gpu_param,
|
||||
left_child_smallest_temp[d_idx].data(), colsample,
|
||||
feature_flags[d_idx].data());
|
||||
}
|
||||
// NOTE: No need to scatter before gather as all devices have same copy of
|
||||
// nodes, and within find_split_kernel() nodes_temp is given values from
|
||||
// nodes
|
||||
|
||||
// nccl only on devices that did split
|
||||
dh::synchronize_n_devices(find_split_n_devices, dList);
|
||||
|
||||
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
dh::safe_nccl(ncclAllGather(
|
||||
reinterpret_cast<const void*>(nodes_temp[d_idx].data()),
|
||||
num_nodes_device * sizeof(Node) / sizeof(char), ncclChar,
|
||||
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
|
||||
find_split_comms[find_split_n_devices - 1][d_idx], *(streams[d_idx])));
|
||||
|
||||
if (depth !=
|
||||
param.max_depth) { // don't copy over children nodes if no more nodes
|
||||
dh::safe_nccl(
|
||||
ncclAllGather(reinterpret_cast<const void*>(nodes_child_temp[d_idx].data()),
|
||||
num_nodes_child_device * sizeof(Node) / sizeof(char),
|
||||
ncclChar, reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
|
||||
find_split_comms[find_split_n_devices - 1][d_idx],
|
||||
*(streams[d_idx]))); // Note offset by n_nodes(depth)
|
||||
// for recvbuff for child nodes
|
||||
}
|
||||
|
||||
dh::safe_nccl(ncclAllGather(
|
||||
reinterpret_cast<const void*>(left_child_smallest_temp[d_idx].data()),
|
||||
num_nodes_device * sizeof(bool) / sizeof(char), ncclChar,
|
||||
reinterpret_cast<void*>(left_child_smallest[d_idx].data() + n_nodes(depth - 1)),
|
||||
find_split_comms[find_split_n_devices - 1][d_idx], *(streams[d_idx])));
|
||||
}
|
||||
|
||||
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
|
||||
}
|
||||
|
||||
if (n_devices > find_split_n_devices && n_devices > 1) {
|
||||
// if n_devices==1, no need to Bcast
|
||||
// if find_split_n_devices==1, this is just a copy operation, else it copies
|
||||
// from master to all nodes in case extra devices not involved in split
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
// for all nodes (split among devices) find best split per node
|
||||
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
int master_device = dList[0];
|
||||
dh::safe_nccl(
|
||||
ncclBcast(reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
|
||||
n_nodes_level(depth) * sizeof(Node) / sizeof(char),
|
||||
ncclChar, master_device, comms[d_idx], *(streams[d_idx])));
|
||||
int nodes_offset_device = d_idx * num_nodes_device;
|
||||
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
|
||||
(const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)),
|
||||
feature_segments[d_idx].data(), depth, (info->num_col),
|
||||
(hmat_.row_ptr.back()), nodes[d_idx].data(), nodes_temp[d_idx].data(),
|
||||
nodes_child_temp[d_idx].data(), nodes_offset_device,
|
||||
fidx_min_map[d_idx].data(), gidx_fvalue_map[d_idx].data(), gpu_param,
|
||||
left_child_smallest_temp[d_idx].data(), colsample,
|
||||
feature_flags[d_idx].data());
|
||||
}
|
||||
|
||||
// nccl only on devices that did split
|
||||
dh::synchronize_n_devices(find_split_n_devices, dList);
|
||||
|
||||
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
dh::safe_nccl(ncclAllGather(
|
||||
reinterpret_cast<const void*>(nodes_temp[d_idx].data()),
|
||||
num_nodes_device * sizeof(Node) / sizeof(char), ncclChar,
|
||||
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
|
||||
find_split_comms[find_split_n_devices - 1][d_idx],
|
||||
*(streams[d_idx])));
|
||||
|
||||
if (depth !=
|
||||
param.max_depth) { // don't copy over children nodes if no more nodes
|
||||
dh::safe_nccl(ncclBcast(
|
||||
dh::safe_nccl(ncclAllGather(
|
||||
reinterpret_cast<const void*>(nodes_child_temp[d_idx].data()),
|
||||
num_nodes_child_device * sizeof(Node) / sizeof(char), ncclChar,
|
||||
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
|
||||
n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
|
||||
master_device, comms[d_idx], *(streams[d_idx])));
|
||||
find_split_comms[find_split_n_devices - 1][d_idx],
|
||||
*(streams[d_idx]))); // Note offset by n_nodes(depth)
|
||||
// for recvbuff for child nodes
|
||||
}
|
||||
|
||||
dh::safe_nccl(ncclBcast(
|
||||
reinterpret_cast<void*>(left_child_smallest[d_idx].data() + n_nodes(depth - 1)),
|
||||
n_nodes_level(depth) * sizeof(bool) / sizeof(char), ncclChar,
|
||||
master_device, comms[d_idx], *(streams[d_idx])));
|
||||
dh::safe_nccl(ncclAllGather(
|
||||
reinterpret_cast<const void*>(left_child_smallest_temp[d_idx].data()),
|
||||
num_nodes_device * sizeof(bool) / sizeof(char), ncclChar,
|
||||
reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
|
||||
n_nodes(depth - 1)),
|
||||
find_split_comms[find_split_n_devices - 1][d_idx],
|
||||
*(streams[d_idx])));
|
||||
}
|
||||
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
for (int d_idx = 0; d_idx < find_split_n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
{
|
||||
if (n_devices > find_split_n_devices && n_devices > 1) {
|
||||
// if n_devices==1, no need to Bcast
|
||||
// if find_split_n_devices==1, this is just a copy operation, else it
|
||||
// copies
|
||||
// from master to all nodes in case extra devices not involved in split
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
int master_device = dList[0];
|
||||
dh::safe_nccl(ncclBcast(
|
||||
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
|
||||
n_nodes_level(depth) * sizeof(Node) / sizeof(char), ncclChar,
|
||||
master_device, comms[d_idx], *(streams[d_idx])));
|
||||
|
||||
if (depth != param.max_depth) { // don't copy over children nodes if no
|
||||
// more nodes
|
||||
dh::safe_nccl(ncclBcast(
|
||||
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
|
||||
n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
|
||||
master_device, comms[d_idx], *(streams[d_idx])));
|
||||
}
|
||||
|
||||
dh::safe_nccl(ncclBcast(
|
||||
reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
|
||||
n_nodes(depth - 1)),
|
||||
n_nodes_level(depth) * sizeof(bool) / sizeof(char), ncclChar,
|
||||
master_device, comms[d_idx], *(streams[d_idx])));
|
||||
}
|
||||
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
|
||||
}
|
||||
}
|
||||
} else if (simuljob == 0 && NCCL == 1) {
|
||||
dosimuljob = 0;
|
||||
int num_nodes_device = n_nodes_level(depth);
|
||||
const int GRID_SIZE = num_nodes_device;
|
||||
|
||||
int d_idx = 0;
|
||||
int master_device = dList[d_idx];
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
@@ -737,9 +758,63 @@ void GPUHistBuilder::LaunchFindSplit(int depth) {
|
||||
gidx_fvalue_map[d_idx].data(), gpu_param,
|
||||
left_child_smallest[d_idx].data(), colsample,
|
||||
feature_flags[d_idx].data());
|
||||
|
||||
// broadcast result
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
dh::safe_nccl(ncclBcast(
|
||||
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth - 1)),
|
||||
n_nodes_level(depth) * sizeof(Node) / sizeof(char), ncclChar,
|
||||
master_device, comms[d_idx], *(streams[d_idx])));
|
||||
|
||||
if (depth !=
|
||||
param.max_depth) { // don't copy over children nodes if no more nodes
|
||||
dh::safe_nccl(ncclBcast(
|
||||
reinterpret_cast<void*>(nodes[d_idx].data() + n_nodes(depth)),
|
||||
n_nodes_level(depth + 1) * sizeof(Node) / sizeof(char), ncclChar,
|
||||
master_device, comms[d_idx], *(streams[d_idx])));
|
||||
}
|
||||
|
||||
dh::safe_nccl(
|
||||
ncclBcast(reinterpret_cast<void*>(left_child_smallest[d_idx].data() +
|
||||
n_nodes(depth - 1)),
|
||||
n_nodes_level(depth) * sizeof(bool) / sizeof(char),
|
||||
ncclChar, master_device, comms[d_idx], *(streams[d_idx])));
|
||||
}
|
||||
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
dh::safe_cuda(cudaStreamSynchronize(*(streams[d_idx])));
|
||||
}
|
||||
} else {
|
||||
dosimuljob = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (dosimuljob) { // if no NCCL or simuljob==1, do this
|
||||
int num_nodes_device = n_nodes_level(depth);
|
||||
const int GRID_SIZE = num_nodes_device;
|
||||
|
||||
// all GPUs do same work
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
|
||||
int nodes_offset_device = 0;
|
||||
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
|
||||
(const gpu_gpair*)(hist_vec[d_idx].GetLevelPtr(depth)),
|
||||
feature_segments[d_idx].data(), depth, (info->num_col),
|
||||
(hmat_.row_ptr.back()), nodes[d_idx].data(), NULL, NULL,
|
||||
nodes_offset_device, fidx_min_map[d_idx].data(),
|
||||
gidx_fvalue_map[d_idx].data(), gpu_param,
|
||||
left_child_smallest[d_idx].data(), colsample,
|
||||
feature_flags[d_idx].data());
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: No need to syncrhonize with host as all above pure P2P ops or
|
||||
// on-device ops
|
||||
}
|
||||
@@ -776,17 +851,15 @@ void GPUHistBuilder::InitFirstNode(const std::vector<bst_gpair>& gpair) {
|
||||
|
||||
std::vector<std::future<gpu_gpair>> future_results(n_devices);
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
|
||||
auto begin = device_gpair[d_idx].tbegin();
|
||||
auto end = device_gpair[d_idx].tend();
|
||||
gpu_gpair init = gpu_gpair();
|
||||
auto binary_op = thrust::plus<gpu_gpair>();
|
||||
|
||||
// std::async captures the algorithm parameters by value
|
||||
// use std::launch::async to ensure the creation of a new thread
|
||||
future_results[d_idx] = std::async(std::launch::async, [=] {
|
||||
int device_idx = dList[d_idx];
|
||||
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||
auto begin = device_gpair[d_idx].tbegin();
|
||||
auto end = device_gpair[d_idx].tend();
|
||||
gpu_gpair init = gpu_gpair();
|
||||
auto binary_op = thrust::plus<gpu_gpair>();
|
||||
return thrust::reduce(begin, end, init, binary_op);
|
||||
});
|
||||
}
|
||||
@@ -1047,8 +1120,8 @@ void GPUHistBuilder::Update(const std::vector<bst_gpair>& gpair,
|
||||
|
||||
// done with multi-GPU, pass back result from master to tree on host
|
||||
int master_device = dList[0];
|
||||
dense2sparse_tree(p_tree, nodes[master_device].tbegin(),
|
||||
nodes[master_device].tend(), param);
|
||||
dh::safe_cuda(cudaSetDevice(master_device));
|
||||
dense2sparse_tree(p_tree, nodes[0].tbegin(), nodes[0].tend(), param);
|
||||
}
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
|
||||
Reference in New Issue
Block a user