[GPU-Plugin] Multi-GPU for grow_gpu_hist histogram method using NVIDIA NCCL. (#2395)

This commit is contained in:
PSEUDOTENSOR / Jonathan McKinney
2017-06-11 13:06:08 -04:00
committed by Rory Mitchell
parent e24f25e0c6
commit 41efe32aa5
19 changed files with 2009 additions and 682 deletions

View File

@@ -168,7 +168,7 @@ void argMaxByKey(Split* nodeSplits, const gpu_gpair* gradScans,
const node_id_t* nodeAssigns, const Node<node_id_t>* nodes, int nUniqKeys,
node_id_t nodeStart, int len, const TrainParam param,
ArgMaxByKeyAlgo algo) {
fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(nodeSplits, nUniqKeys, Split());
fillConst<Split,BLKDIM,ITEMS_PER_THREAD>(param.gpu_id, nodeSplits, nUniqKeys, Split());
int nBlks = dh::div_round_up(len, ITEMS_PER_THREAD*BLKDIM);
switch(algo) {
case ABK_GMEM:

View File

@@ -208,7 +208,7 @@ private:
dh::dvec<gpu_gpair> tmpScanGradBuff;
dh::dvec<int> tmpScanKeyBuff;
dh::dvec<int> colIds;
dh::bulk_allocator ba;
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
void findSplit(int level, node_id_t nodeStart, int nNodes) {
reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(),
@@ -226,7 +226,8 @@ private:
void allocateAllData(int offsetSize) {
int tmpBuffSize = scanTempBufferSize(nVals);
ba.allocate(&vals, nVals,
ba.allocate(param.gpu_id,
&vals, nVals,
&vals_cached, nVals,
&instIds, nVals,
&instIds_cached, nVals,
@@ -245,7 +246,7 @@ private:
}
void setupOneTimeData(DMatrix& hMat) {
size_t free_memory = dh::available_memory();
size_t free_memory = dh::available_memory(param.gpu_id);
if (!hMat.SingleColBlock()) {
throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
}
@@ -258,7 +259,7 @@ private:
if (!param.silent) {
const int mb_size = 1048576;
LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
<< free_memory / mb_size << " MB on " << dh::device_name();
<< free_memory / mb_size << " MB on " << dh::device_name(param.gpu_id);
}
}
@@ -340,7 +341,7 @@ private:
colOffsets.data(), vals.current(),
nVals, nCols);
// gather the node assignments across all other columns too
gather<node_id_t>(nodeAssigns.current(), nodeAssignsPerInst.data(),
gather<node_id_t>(param.gpu_id, nodeAssigns.current(), nodeAssignsPerInst.data(),
instIds.current(), nVals);
sortKeys(level);
}
@@ -351,7 +352,7 @@ private:
// but we don't need more than level+1 bits for sorting!
segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
0, level+1);
gather<float,int>(vals.other(), vals.current(), instIds.other(),
gather<float,int>(param.gpu_id, vals.other(), vals.current(), instIds.other(),
instIds.current(), nodeLocations.current(), nVals);
vals.buff().selector ^= 1;
instIds.buff().selector ^= 1;