// xgboost/plugin/updater_gpu/src/exact/gpu_builder.cuh

/*
 * Copyright (c) 2017, NVIDIA CORPORATION, Xgboost contributors. All rights
 * reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once
#include <vector>
#include "../../../../src/tree/param.h"
#include "../common.cuh"
#include "node.cuh"
#include "split2node.cuh"
#include "argmax_by_key.cuh"
#include "fused_scan_reduce_by_key.cuh"
#include "xgboost/tree_updater.h"
namespace xgboost {
namespace tree {
namespace exact {
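
/**
 * @brief Single-thread kernel that initializes the root node from the
 *        pre-reduced gradient sum (computed in transferGrads): fills in
 *        its gain, weight, and id.
 */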
template <typename node_id_t>
__global__ void initRootNode(Node<node_id_t>* nodes, const bst_gpair* sums,
                             const TrainParam param) {
  // gradients already evaluated inside transferGrads
  Node<node_id_t> n;
  n.gradSum = sums[0];
  n.score = CalcGain(param, n.gradSum.grad, n.gradSum.hess);
  n.weight = CalcWeight(param, n.gradSum.grad, n.gradSum.hess);
  n.id = 0;
  nodes[0] = n;
}
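
/**
 * @brief One block per column: tags every value in the column's
 *        [colOffsets[c], colOffsets[c+1]) range with its column id.
 */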
template <typename node_id_t>
__global__ void assignColIds(int* colIds, const int* colOffsets) {
  int myId = blockIdx.x;
  int start = colOffsets[myId];
  int end = colOffsets[myId + 1];
  for (int id = start + threadIdx.x; id < end; id += blockDim.x) {
    colIds[id] = myId;
  }
}
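
/**
 * @brief Per-instance kernel that moves every instance to the default
 *        (missing-value) child of its current node, or marks it
 *        UNUSED_NODE if that node is a leaf or unused.
 */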
template <typename node_id_t>
__global__ void fillDefaultNodeIds(node_id_t* nodeIdsPerInst,
                                   const Node<node_id_t>* nodes, int nRows) {
  int id = threadIdx.x + (blockIdx.x * blockDim.x);
  if (id >= nRows) {
    return;
  }
  node_id_t nId = nodeIdsPerInst[id];
  // nothing to do if this instance belongs to none of the active nodes
  if (nId == UNUSED_NODE) {
    return;
  }
  const Node<node_id_t> n = nodes[nId];
  node_id_t result;
  if (n.isLeaf() || n.isUnused()) {
    result = UNUSED_NODE;
  } else if (n.isDefaultLeft()) {
    result = (2 * n.id) + 1;
  } else {
    result = (2 * n.id) + 2;
  }
  nodeIdsPerInst[id] = result;
}
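
/**
 * @brief Grid-stride kernel over all feature values: initializes
 *        nodeLocations with the identity permutation and, for values that
 *        lie in their node's split column, routes the owning instance to
 *        the left/right child based on the split threshold.
 */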
template <typename node_id_t>
__global__ void assignNodeIds(node_id_t* nodeIdsPerInst, int* nodeLocations,
                              const node_id_t* nodeIds, const int* instId,
                              const Node<node_id_t>* nodes,
                              const int* colOffsets, const float* vals,
                              int nVals, int nCols) {
  int id = threadIdx.x + (blockIdx.x * blockDim.x);
  const int stride = blockDim.x * gridDim.x;
  for (; id < nVals; id += stride) {
    // fusing generation of indices for node locations
    nodeLocations[id] = id;
    // using nodeIds here since the previous kernel has already updated
    // nodeIdsPerInst with all the default assignments
    int nId = nodeIds[id];
    // skip if this element belongs to none of the currently active nodes
    if (nId != UNUSED_NODE) {
      const Node<node_id_t> n = nodes[nId];
      int colId = n.colIdx;
      int start = colOffsets[colId];
      int end = colOffsets[colId + 1];
      /// @todo too many wasted threads: only values in the split column do work
      if ((id >= start) && (id < end) && !(n.isLeaf() || n.isUnused())) {
        node_id_t result = (2 * n.id) + 1 + (vals[id] >= n.threshold);
        nodeIdsPerInst[instId[id]] = result;
      }
    }
  }
}
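
/**
 * @brief Marks bottom-level nodes and nodes whose children are both
 *        unused as leaves by resetting their split score to -FLT_MAX.
 */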
template <typename node_id_t>
__global__ void markLeavesKernel(Node<node_id_t>* nodes, int len) {
  int id = (blockIdx.x * blockDim.x) + threadIdx.x;
  if ((id < len) && !nodes[id].isUnused()) {
    int lid = (id << 1) + 1;
    int rid = (id << 1) + 2;
    if ((lid >= len) || (rid >= len)) {
      nodes[id].score = -FLT_MAX;  // bottom-most nodes
    } else if (nodes[lid].isUnused() && nodes[rid].isUnused()) {
      nodes[id].score = -FLT_MAX;  // unused child nodes
    }
  }
}

// unit test forward declaration for friend function access
template <typename node_id_t>
void testSmallData();
template <typename node_id_t>
void testLargeData();
template <typename node_id_t>
void testAllocate();
template <typename node_id_t>
void testMarkLeaves();
template <typename node_id_t>
void testDense2Sparse();
template <typename node_id_t>
class GPUBuilder;
template <typename node_id_t>
std::shared_ptr<xgboost::DMatrix> setupGPUBuilder(
    const std::string& file,
    xgboost::tree::exact::GPUBuilder<node_id_t>& builder);
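
/**
 * @brief Exact (non-histogram) GPU tree builder. Keeps a column-major,
 *        per-column-sorted copy of the data on the device and grows the
 *        tree one level at a time.
 */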
template <typename node_id_t>
class GPUBuilder {
 public:
  GPUBuilder() : allocated(false) {}
  ~GPUBuilder() {}

  void Init(const TrainParam& p) {
    param = p;
    maxNodes = (1 << (param.max_depth + 1)) - 1;
    maxLeaves = 1 << param.max_depth;
  }

  void UpdateParam(const TrainParam& param) { this->param = param; }

  /// @note Update must be called only after Init!
  void Update(const std::vector<bst_gpair>& gpair, DMatrix* hMat,
              RegTree* hTree) {
    if (!allocated) {
      setupOneTimeData(*hMat);
    }
    for (int i = 0; i < param.max_depth; ++i) {
      if (i == 0) {
        // make sure to start on a fresh tree with sorted values!
        vals.current_dvec() = vals_cached;
        instIds.current_dvec() = instIds_cached;
        transferGrads(gpair);
      }
      int nNodes = 1 << i;
      node_id_t nodeStart = nNodes - 1;
      initNodeData(i, nodeStart, nNodes);
      findSplit(i, nodeStart, nNodes);
    }
    // mark all the used nodes with unused children as leaf nodes
    markLeaves();
    dense2sparse(*hTree);
  }

 private:
  friend void testSmallData<node_id_t>();
  friend void testLargeData<node_id_t>();
  friend void testAllocate<node_id_t>();
  friend void testMarkLeaves<node_id_t>();
  friend void testDense2Sparse<node_id_t>();
  friend std::shared_ptr<xgboost::DMatrix> setupGPUBuilder<node_id_t>(
      const std::string& file, GPUBuilder<node_id_t>& builder);

  TrainParam param;
  /** whether we have initialized memory already (so as not to repeat!) */
  bool allocated;
  /** feature values stored in column-major compressed format */
  dh::dvec2<float> vals;
  dh::dvec<float> vals_cached;
  /** corresponding instance ids of these feature values */
  dh::dvec2<int> instIds;
  dh::dvec<int> instIds_cached;
  /** column offsets for these feature values */
  dh::dvec<int> colOffsets;
  dh::dvec<bst_gpair> gradsInst;
  dh::dvec2<node_id_t> nodeAssigns;
  dh::dvec2<int> nodeLocations;
  dh::dvec<Node<node_id_t>> nodes;
  dh::dvec<node_id_t> nodeAssignsPerInst;
  dh::dvec<bst_gpair> gradSums;
  dh::dvec<bst_gpair> gradScans;
  dh::dvec<Split> nodeSplits;
  int nVals;
  int nRows;
  int nCols;
  int maxNodes;
  int maxLeaves;
  dh::CubMemory tmp_mem;
  dh::dvec<bst_gpair> tmpScanGradBuff;
  dh::dvec<int> tmpScanKeyBuff;
  dh::dvec<int> colIds;
  dh::bulk_allocator<dh::memory_type::DEVICE> ba;
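
  /**
   * @brief Finds the best split for every active node at the given level:
   *        scan/reduce gradients per (node, column) segment, arg-max the
   *        split gain per node, then turn the winning splits into nodes.
   */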
  void findSplit(int level, node_id_t nodeStart, int nNodes) {
    reduceScanByKey(gradSums.data(), gradScans.data(), gradsInst.data(),
                    instIds.current(), nodeAssigns.current(), nVals, nNodes,
                    nCols, tmpScanGradBuff.data(), tmpScanKeyBuff.data(),
                    colIds.data(), nodeStart);
    argMaxByKey(nodeSplits.data(), gradScans.data(), gradSums.data(),
                vals.current(), colIds.data(), nodeAssigns.current(),
                nodes.data(), nNodes, nodeStart, nVals, param,
                level <= MAX_ABK_LEVELS ? ABK_SMEM : ABK_GMEM);
    split2node(nodes.data(), nodeSplits.data(), gradScans.data(),
               gradSums.data(), vals.current(), colIds.data(),
               colOffsets.data(), nodeAssigns.current(), nNodes, nodeStart,
               nCols, param);
  }
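
  /** @brief Performs a single bulk device allocation for all working buffers. */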
  void allocateAllData(int offsetSize) {
    int tmpBuffSize = scanTempBufferSize(nVals);
    ba.allocate(dh::get_device_idx(param.gpu_id), &vals, nVals, &vals_cached,
                nVals, &instIds, nVals, &instIds_cached, nVals, &colOffsets,
                offsetSize, &gradsInst, nRows, &nodeAssigns, nVals,
                &nodeLocations, nVals, &nodes, maxNodes, &nodeAssignsPerInst,
                nRows, &gradSums, maxLeaves * nCols, &gradScans, nVals,
                &nodeSplits, maxLeaves, &tmpScanGradBuff, tmpBuffSize,
                &tmpScanKeyBuff, tmpBuffSize, &colIds, nVals);
  }
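
  /**
   * @brief One-time setup: converts the DMatrix to CSC on the host,
   *        allocates all device buffers, then copies and sorts the data.
   */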
  void setupOneTimeData(DMatrix& hMat) {
    size_t free_memory = dh::available_memory(dh::get_device_idx(param.gpu_id));
    if (!hMat.SingleColBlock()) {
      throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
    }
    std::vector<float> fval;
    std::vector<int> fId, offset;
    convertToCsc(hMat, fval, fId, offset);
    allocateAllData((int)offset.size());
    transferAndSortData(fval, fId, offset);
    allocated = true;
    if (!param.silent) {
      const int mb_size = 1048576;
      LOG(CONSOLE) << "Allocated " << ba.size() / mb_size << "/"
                   << free_memory / mb_size << " MB on "
                   << dh::device_name(dh::get_device_idx(param.gpu_id));
    }
  }
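
  /**
   * @brief Flattens the DMatrix into CSC arrays: feature values, their
   *        instance ids, and per-column offsets.
   */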
  void convertToCsc(DMatrix& hMat, std::vector<float>& fval,
                    std::vector<int>& fId, std::vector<int>& offset) {
    MetaInfo info = hMat.info();
    nRows = info.num_row;
    nCols = info.num_col;
    offset.reserve(nCols + 1);
    offset.push_back(0);
    fval.reserve(nCols * nRows);
    fId.reserve(nCols * nRows);
    // if the DMatrix has no column access yet, enable it
    // before copying the data!
    if (!hMat.HaveColAccess()) {
      const std::vector<bool> enable(nCols, true);
      hMat.InitColAccess(enable, 1, nRows);
    }
    dmlc::DataIter<ColBatch>* iter = hMat.ColIterator();
    iter->BeforeFirst();
    while (iter->Next()) {
      const ColBatch& batch = iter->Value();
      for (int i = 0; i < batch.size; i++) {
        const ColBatch::Inst& col = batch[i];
        for (const ColBatch::Entry* it = col.data; it != col.data + col.length;
             it++) {
          int inst_id = static_cast<int>(it->index);
          fval.push_back(it->fvalue);
          fId.push_back(inst_id);
        }
        offset.push_back(fval.size());
      }
    }
    nVals = fval.size();
  }
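
  /**
   * @brief Copies the CSC arrays to the device, sorts values within each
   *        column, and caches the sorted copies for reuse across trees.
   */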
  void transferAndSortData(const std::vector<float>& fval,
                           const std::vector<int>& fId,
                           const std::vector<int>& offset) {
    vals.current_dvec() = fval;
    instIds.current_dvec() = fId;
    colOffsets = offset;
    segmentedSort<float, int>(tmp_mem, vals, instIds, nVals, nCols, colOffsets);
    vals_cached = vals.current_dvec();
    instIds_cached = instIds.current_dvec();
    assignColIds<node_id_t><<<nCols, 512>>>(colIds.data(), colOffsets.data());
  }
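
  /**
   * @brief Copies the per-instance gradients to the device and reduces
   *        them into the gradient sum for the root node.
   */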
  void transferGrads(const std::vector<bst_gpair>& gpair) {
    // HACK: raw memcpy from the host vector's backing storage
    dh::safe_cuda(cudaMemcpy(gradsInst.data(), &(gpair[0]),
                             sizeof(bst_gpair) * nRows,
                             cudaMemcpyHostToDevice));
    // evaluate the full-grad reduction for the root node
    sumReduction<bst_gpair>(tmp_mem, gradsInst, gradSums, nRows);
  }
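
  /**
   * @brief Prepares node assignments for the current level: level 0 maps
   *        everything to the root, while deeper levels route instances to
   *        their children and re-sort the value arrays by node id.
   */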
  void initNodeData(int level, node_id_t nodeStart, int nNodes) {
    // all instances belong to the root node at the beginning!
    if (level == 0) {
      nodes.fill(Node<node_id_t>());
      nodeAssigns.current_dvec().fill(0);
      nodeAssignsPerInst.fill(0);
      // for the root node, just update the gradient/score/weight/id info
      // before splitting it! All data already resides on the GPU, hence
      // this single-thread kernel
      initRootNode<<<1, 1>>>(nodes.data(), gradSums.data(), param);
    } else {
      const int BlkDim = 256;
      const int ItemsPerThread = 4;
      // assign default node ids first
      int nBlks = dh::div_round_up(nRows, BlkDim);
      fillDefaultNodeIds<<<nBlks, BlkDim>>>(nodeAssignsPerInst.data(),
                                            nodes.data(), nRows);
      // evaluate the correct child indices of non-missing values next
      nBlks = dh::div_round_up(nVals, BlkDim * ItemsPerThread);
      assignNodeIds<<<nBlks, BlkDim>>>(
          nodeAssignsPerInst.data(), nodeLocations.current(),
          nodeAssigns.current(), instIds.current(), nodes.data(),
          colOffsets.data(), vals.current(), nVals, nCols);
      // gather the node assignments across all other columns too
      gather<node_id_t>(dh::get_device_idx(param.gpu_id), nodeAssigns.current(),
                        nodeAssignsPerInst.data(), instIds.current(), nVals);
      sortKeys(level);
    }
  }
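
  /**
   * @brief Segmented-sorts values and instance ids by node id within each
   *        column, then flips the double buffers to the freshly sorted side.
   */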
  void sortKeys(int level) {
    // segmented-sort the arrays based on node ids;
    // we don't need more than level+1 bits for sorting!
    segmentedSort(tmp_mem, nodeAssigns, nodeLocations, nVals, nCols, colOffsets,
                  0, level + 1);
    gather<float, int>(dh::get_device_idx(param.gpu_id), vals.other(),
                       vals.current(), instIds.other(), instIds.current(),
                       nodeLocations.current(), nVals);
    vals.buff().selector ^= 1;
    instIds.buff().selector ^= 1;
  }
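
  /** @brief Launches markLeavesKernel over the whole dense node array. */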
  void markLeaves() {
    const int BlkDim = 128;
    int nBlks = dh::div_round_up(maxNodes, BlkDim);
    markLeavesKernel<<<nBlks, BlkDim>>>(nodes.data(), maxNodes);
  }
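
  /**
   * @brief Converts the dense, level-order device node array into the
   *        sparse RegTree representation expected by xgboost.
   */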
  void dense2sparse(RegTree& tree) {
    std::vector<Node<node_id_t>> hNodes = nodes.as_vector();
    int nodeId = 0;
    for (int i = 0; i < maxNodes; ++i) {
      const Node<node_id_t>& n = hNodes[i];
      if ((i != 0) && n.isLeaf()) {
        tree[nodeId].set_leaf(n.weight * param.learning_rate);
        tree.stat(nodeId).sum_hess = n.gradSum.hess;
        ++nodeId;
      } else if (!n.isUnused()) {
        tree.AddChilds(nodeId);
        tree[nodeId].set_split(n.colIdx, n.threshold, n.dir == LeftDir);
        tree.stat(nodeId).loss_chg = n.score;
        tree.stat(nodeId).sum_hess = n.gradSum.hess;
        tree.stat(nodeId).base_weight = n.weight;
        tree[tree[nodeId].cleft()].set_leaf(0);
        tree[tree[nodeId].cright()].set_leaf(0);
        ++nodeId;
      }
    }
  }
};
} // namespace exact
} // namespace tree
} // namespace xgboost