[GPU-Plugin] Integration of a faster version of grow_gpu plugin into mainstream (#2360)
* Integrating a faster version of grow_gpu plugin 1. Removed the older files to reduce duplication 2. Moved all of the grow_gpu files under 'exact' folder 3. All of them are inside 'exact' namespace to avoid any conflicts 4. Fixed a bug in benchmark.py while running only 'grow_gpu' plugin 5. Added cub and googletest submodules to ease integration and unit-testing 6. Updates to CMakeLists.txt to directly build cuda objects into libxgboost * Added support for building gpu plugins through make flow 1. updated makefile and config.mk to add right targets 2. added unit-tests for gpu exact plugin code * 1. Added support for building gpu plugin using 'make' flow as well 2. Updated instructions for building and testing gpu plugin * Fix travis-ci errors for PR#2360 1. lint errors on unit-tests 2. removed googletest, instead depended upon dmlc-core provide gtest cache * Some more fixes to travis-ci lint failures PR#2360 * Added Rory's copyrights to the files containing code from both. * updated copyright statement as per Rory's request * moved the static datasets into a script to generate them at runtime * 1. memory usage print when silent=0 2. tests/ and test/ folder organization 3. removal of the dependency of googletest for just building xgboost 4. coding style updates for .cuh as well * Fixes for compilation warnings * add cuda object files as well when JVM_BINDINGS=ON
This commit is contained in:
128
plugin/updater_gpu/test/cpp/argmax_by_key.cu
Normal file
128
plugin/updater_gpu/test/cpp/argmax_by_key.cu
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "gtest/gtest.h"
|
||||
#include "../../src/exact/argmax_by_key.cuh"
|
||||
#include "../../src/exact/gradients.cuh"
|
||||
#include "../../src/exact/node.cuh"
|
||||
#include "../../src/exact/loss_functions.cuh"
|
||||
#include "utils.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
// maxSplit() must pick the split with the higher score; on a score tie the
// smaller index wins.
TEST(ArgMaxByKey, maxSplit) {
  Split lhs, rhs;
  lhs.score = 2.f;
  lhs.index = 3;
  // rhs strictly better: rhs wins outright
  rhs.score = 3.f;
  rhs.index = 4;
  Split best = maxSplit(lhs, rhs);
  EXPECT_FLOAT_EQ(best.score, rhs.score);
  EXPECT_EQ(best.index, rhs.index);
  // score tie, rhs has the larger index: lhs's index is kept
  rhs.score = 2.f;
  rhs.index = 4;
  best = maxSplit(lhs, rhs);
  EXPECT_FLOAT_EQ(best.score, lhs.score);
  EXPECT_EQ(best.index, lhs.index);
  // score tie, rhs has the smaller index: rhs's index is kept
  rhs.score = 2.f;
  rhs.index = 2;
  best = maxSplit(lhs, rhs);
  EXPECT_FLOAT_EQ(best.score, lhs.score);
  EXPECT_EQ(best.index, rhs.index);
  // rhs strictly worse: lhs wins outright
  rhs.score = 1.f;
  rhs.index = 1;
  best = maxSplit(lhs, rhs);
  EXPECT_FLOAT_EQ(best.score, lhs.score);
  EXPECT_EQ(best.index, lhs.index);
}
|
||||
|
||||
// Builds a single-column data set assigned entirely to the root node and
// verifies that argMaxByKey() reports no profitable split (score 0, index 0)
// for the given algorithm variant.
// Fix: the original leaked the host `nodes` array and the device `dNodes`
// buffer — every other allocation was freed but these two were not.
template <typename node_id_t>
void argMaxTest(ArgMaxByKeyAlgo algo) {
  const int nVals = 1024;
  const int level = 0;
  const int nKeys = 1 << level;  // number of nodes at this tree level (1)
  gpu_gpair* scans = new gpu_gpair[nVals];
  float* vals = new float[nVals];
  int* colIds = new int[nVals];
  scans[0] = gpu_gpair();
  vals[0] = 0.f;
  colIds[0] = 0;
  // prefix sums grow linearly; feature values are 0.0, 0.1, 0.2, ...
  for (int i = 1; i < nVals; ++i) {
    scans[i].g = scans[i-1].g + (0.1f * 2.f);
    scans[i].h = scans[i-1].h + (0.1f * 2.f);
    vals[i] = static_cast<float>(i) * 0.1f;
    colIds[i] = 0;  // everything lives in column 0
  }
  float* dVals;
  allocateAndUpdateOnGpu<float>(dVals, vals, nVals);
  gpu_gpair* dScans;
  allocateAndUpdateOnGpu<gpu_gpair>(dScans, scans, nVals);
  gpu_gpair* sums = new gpu_gpair[nKeys];
  sums[0].g = sums[0].h = (0.1f * 2.f * nVals);  // total over the column
  gpu_gpair* dSums;
  allocateAndUpdateOnGpu<gpu_gpair>(dSums, sums, nKeys);
  int* dColIds;
  allocateAndUpdateOnGpu<int>(dColIds, colIds, nVals);
  Split* splits = new Split[nKeys];
  Split* dSplits;
  allocateOnGpu<Split>(dSplits, nKeys);
  node_id_t* nodeAssigns = new node_id_t[nVals];
  memset(nodeAssigns, 0, sizeof(node_id_t)*nVals);  // all rows -> node 0
  node_id_t* dNodeAssigns;
  allocateAndUpdateOnGpu<node_id_t>(dNodeAssigns, nodeAssigns, nVals);
  Node<node_id_t>* nodes = new Node<node_id_t>[nKeys];
  nodes[0].gradSum = sums[0];
  nodes[0].id = 0;
  TrainParam param;
  param.min_child_weight = 0.0f;
  param.reg_alpha = 0.f;
  param.reg_lambda = 2.f;
  param.max_delta_step = 0.f;
  nodes[0].score = CalcGain(param, sums[0].g, sums[0].h);
  Node<node_id_t>* dNodes;
  allocateAndUpdateOnGpu<Node<node_id_t> >(dNodes, nodes, nKeys);
  argMaxByKey<node_id_t>(dSplits, dScans, dSums, dVals, dColIds, dNodeAssigns,
                         dNodes, nKeys, 0, nVals, param, algo);
  updateHostPtr<Split>(splits, dSplits, nKeys);
  // gradients grow uniformly with the values, so no split improves the gain
  EXPECT_FLOAT_EQ(0.f, splits->score);
  EXPECT_EQ(0, splits->index);
  // fix: free the node buffers too (previously leaked)
  dh::safe_cuda(cudaFree(dNodes));
  delete [] nodes;
  dh::safe_cuda(cudaFree(dNodeAssigns));
  delete [] nodeAssigns;
  dh::safe_cuda(cudaFree(dSplits));
  delete [] splits;
  dh::safe_cuda(cudaFree(dColIds));
  delete [] colIds;
  dh::safe_cuda(cudaFree(dSums));
  delete [] sums;
  dh::safe_cuda(cudaFree(dVals));
  delete [] vals;
  dh::safe_cuda(cudaFree(dScans));
  delete [] scans;
}
|
||||
|
||||
// Exercise both memory variants of the arg-max reduction on the
// one-column fixture.
TEST(ArgMaxByKey, testOneColGmem) { argMaxTest<int16_t>(ABK_GMEM); }

TEST(ArgMaxByKey, testOneColSmem) { argMaxTest<int16_t>(ABK_SMEM); }
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
117
plugin/updater_gpu/test/cpp/fused_reduce_scan_by_key.cu
Normal file
117
plugin/updater_gpu/test/cpp/fused_reduce_scan_by_key.cu
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "gtest/gtest.h"
|
||||
#include "../../src/exact/fused_scan_reduce_by_key.cuh"
|
||||
#include "../../src/exact/node.cuh"
|
||||
#include "utils.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
// Drives reduceScanByKey(): builds reference per-segment sums and
// per-element exclusive scans on the host, runs the device kernel, and
// compares both outputs against the references.
// Fix: the temporary `buckets` array was allocated with new[] in the
// constructor and never freed.
template <typename node_id_t>
class ReduceScanByKey: public Generator<node_id_t> {
 public:
  ReduceScanByKey(int nc, int nr, int nk, const std::string& tName):
    Generator<node_id_t>(nc, nr, nk, tName),
    hSums(nullptr), dSums(nullptr), hScans(nullptr), dScans(nullptr),
    outSize(this->size), nSegments(this->nKeys*this->nCols),
    hOffsets(nullptr), dOffsets(nullptr) {
    hSums = new gpu_gpair[nSegments];
    allocateOnGpu<gpu_gpair>(dSums, nSegments);
    hScans = new gpu_gpair[outSize];
    allocateOnGpu<gpu_gpair>(dScans, outSize);
    // running per-segment accumulators used to build the exclusive scans
    gpu_gpair* buckets = new gpu_gpair[nSegments];
    for (int i = 0; i < nSegments; i++) {
      buckets[i] = gpu_gpair();
      hSums[i] = gpu_gpair();
    }
    // reference segment sums (skip keys outside the valid range)
    for (int i = 0; i < outSize; ++i) {
      if (this->hKeys[i] >= 0 && this->hKeys[i] < nSegments) {
        node_id_t key = abs2uniqKey<node_id_t>(i, this->hKeys,
                                               this->hColIds, 0,
                                               this->nKeys);
        hSums[key] += this->hVals[i];
      }
    }
    // reference exclusive scan within each segment
    for (int i = 0; i < outSize; ++i) {
      node_id_t key = abs2uniqKey<node_id_t>(i, this->hKeys,
                                             this->hColIds, 0,
                                             this->nKeys);
      hScans[i] = buckets[key];
      buckets[key] += this->hVals[i];
    }
    delete [] buckets;  // fix: previously leaked
    // it's a dense matrix that we are currently looking at, so offsets
    // are nicely aligned! (need not be the case in real datasets)
    hOffsets = new int[this->nCols];
    size_t off = 0;
    for (int i = 0; i < this->nCols; ++i, off += this->nRows) {
      hOffsets[i] = off;
    }
    allocateAndUpdateOnGpu<int>(dOffsets, hOffsets, this->nCols);
  }

  ~ReduceScanByKey() {
    delete [] hScans;
    delete [] hSums;
    delete [] hOffsets;
    dh::safe_cuda(cudaFree(dScans));
    dh::safe_cuda(cudaFree(dSums));
    dh::safe_cuda(cudaFree(dOffsets));
  }

  // Launches the kernel with freshly allocated temp buffers and checks the
  // device results against the host references.
  void run() {
    gpu_gpair* tmpScans;
    int* tmpKeys;
    int tmpSize = scanTempBufferSize(this->size);
    allocateOnGpu<gpu_gpair>(tmpScans, tmpSize);
    allocateOnGpu<int>(tmpKeys, tmpSize);
    TIMEIT(reduceScanByKey<node_id_t>
           (dSums, dScans, this->dVals, this->dInstIds, this->dKeys,
            this->size, this->nKeys, this->nCols, tmpScans, tmpKeys,
            this->dColIds, 0),
           this->testName);
    dh::safe_cuda(cudaFree(tmpScans));
    dh::safe_cuda(cudaFree(tmpKeys));
    this->compare(hSums, dSums, nSegments);
    this->compare(hScans, dScans, outSize);
  }

 private:
  gpu_gpair* hSums;   // host reference segment sums
  gpu_gpair* dSums;   // device segment sums
  gpu_gpair* hScans;  // host reference exclusive scans
  gpu_gpair* dScans;  // device scans
  int outSize;        // == size (one scan value per input element)
  int nSegments;      // nKeys * nCols
  int* hOffsets;      // column start offsets (dense -> i * nRows)
  int* dOffsets;
};
|
||||
|
||||
// Run the reduce/scan fixture with both node-id widths.
TEST(ReduceScanByKey, testInt16) {
  ReduceScanByKey<int16_t>(32, 512, 32, "ReduceScanByKey").run();
}

TEST(ReduceScanByKey, testInt32) {
  ReduceScanByKey<int>(32, 512, 32, "ReduceScanByKey").run();
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
1637
plugin/updater_gpu/test/cpp/generate_data.sh
Executable file
1637
plugin/updater_gpu/test/cpp/generate_data.sh
Executable file
File diff suppressed because it is too large
Load Diff
308
plugin/updater_gpu/test/cpp/gpu_builder.cu
Normal file
308
plugin/updater_gpu/test/cpp/gpu_builder.cu
Normal file
@@ -0,0 +1,308 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "gtest/gtest.h"
|
||||
#include "utils.cuh"
|
||||
#include "../../src/exact/gpu_builder.cuh"
|
||||
#include "../../src/exact/node.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
static const std::vector<int> smallColSizes = {0, 5, 0, 6, 4, 0, 0, 2, 0, 11,
|
||||
2, 9, 0, 5, 1, 0, 12, 3};
|
||||
|
||||
// Loads the small libsvm sample into the GPU builder (depth 1) and verifies
// the one-time CSC setup: dimensions, per-column counts, the first few
// (instance, value, gradient) triples, and the expanded column-id array.
// Fix: the column-size loop only checked the first 15 of the 18 columns
// (nCols is asserted to be 18 and smallColSizes has 18 entries).
template <typename node_id_t>
void testSmallData() {
  GPUBuilder<node_id_t> builder;
  std::shared_ptr<DMatrix> dm =
    setupGPUBuilder<node_id_t>("plugin/updater_gpu/test/cpp/data/small.sample.libsvm",
                               builder, 1);
  // data dimensions
  ASSERT_EQ(60, builder.nVals);
  ASSERT_EQ(15, builder.nRows);
  ASSERT_EQ(18, builder.nCols);
  ASSERT_TRUE(builder.allocated);
  // column counts: check every column, not just the first 15
  int* tmpOff = new int[builder.nCols+1];
  updateHostPtr<int>(tmpOff, builder.colOffsets.data(), builder.nCols+1);
  for (int i = 0; i < builder.nCols; ++i) {
    EXPECT_EQ(smallColSizes[i], tmpOff[i+1]-tmpOff[i]);
  }
  float* tmpVal = new float[builder.nVals];
  updateHostPtr<float>(tmpVal, builder.vals.current(), builder.nVals);
  int* tmpInst = new int[builder.nVals];
  updateHostPtr<int>(tmpInst, builder.instIds.current(), builder.nVals);
  gpu_gpair* tmpGrad = new gpu_gpair[builder.nRows];
  updateHostPtr<gpu_gpair>(tmpGrad, builder.gradsInst.data(), builder.nRows);
  // first three entries; gradients follow the i%10 rule set up by
  // preparePluginInputs
  const int expectedInst[] = {0, 2, 7};
  for (int i = 0; i < 3; ++i) {
    EXPECT_EQ(expectedInst[i], tmpInst[i]);
    EXPECT_FLOAT_EQ(1.f, tmpVal[i]);
    EXPECT_FLOAT_EQ(1.f+static_cast<float>(tmpInst[i]%10),
                    get(i, tmpGrad, tmpInst).g);
    EXPECT_FLOAT_EQ(.5f+static_cast<float>(tmpInst[i]%10),
                    get(i, tmpGrad, tmpInst).h);
  }
  delete [] tmpGrad;
  delete [] tmpOff;
  delete [] tmpInst;
  delete [] tmpVal;
  // the expanded column-id array must replay the column sizes in order
  int* colIds = new int[builder.nVals];
  updateHostPtr<int>(colIds, builder.colIds.data(), builder.nVals);
  std::vector<int> colSizeCopy(smallColSizes);
  int colIdxCurr = 0;
  for (int i = 0; i < builder.nVals; ++i) {
    while (colSizeCopy[colIdxCurr] == 0) {
      ++colIdxCurr;
    }
    --colSizeCopy[colIdxCurr];
    EXPECT_EQ(colIdxCurr, colIds[i]);
  }
  delete [] colIds;
}
|
||||
|
||||
// Small-sample CSC setup with both node-id widths.
TEST(CudaGPUBuilderTest, SetupOneTimeDataSmallInt16) { testSmallData<int16_t>(); }

TEST(CudaGPUBuilderTest, SetupOneTimeDataSmallInt32) { testSmallData<int>(); }
|
||||
|
||||
// Same one-time-setup checks as testSmallData, but against the larger
// sample: dimensions, a few column counts, and the first three
// (instance, value, gradient) triples.
template <typename node_id_t>
void testLargeData() {
  GPUBuilder<node_id_t> builder;
  std::shared_ptr<DMatrix> dm =
    setupGPUBuilder<node_id_t>("plugin/updater_gpu/test/cpp/data/sample.libsvm",
                               builder, 1);
  ASSERT_EQ(35442, builder.nVals);
  ASSERT_EQ(1611, builder.nRows);
  ASSERT_EQ(127, builder.nCols);
  ASSERT_TRUE(builder.allocated);
  // spot-check the first three column sizes
  int* tmpOff = new int[builder.nCols+1];
  updateHostPtr<int>(tmpOff, builder.colOffsets.data(), builder.nCols+1);
  EXPECT_EQ(0, tmpOff[1]-tmpOff[0]);   // 1st col
  EXPECT_EQ(83, tmpOff[2]-tmpOff[1]);  // 2nd col
  EXPECT_EQ(1, tmpOff[3]-tmpOff[2]);   // 3rd col
  float* tmpVal = new float[builder.nVals];
  updateHostPtr<float>(tmpVal, builder.vals.current(), builder.nVals);
  int* tmpInst = new int[builder.nVals];
  updateHostPtr<int>(tmpInst, builder.instIds.current(), builder.nVals);
  gpu_gpair* tmpGrad = new gpu_gpair[builder.nRows];
  updateHostPtr<gpu_gpair>(tmpGrad, builder.gradsInst.data(), builder.nRows);
  // the order of observations is messed up before the convertToCsc call!
  // hence, the instance IDs have been manually checked and put here.
  const int expectedInst[] = {1164, 1435, 1421};
  for (int i = 0; i < 3; ++i) {
    EXPECT_EQ(expectedInst[i], tmpInst[i]);
    EXPECT_FLOAT_EQ(1.f, tmpVal[i]);
    EXPECT_FLOAT_EQ(1.f+static_cast<float>(tmpInst[i]%10),
                    get(i, tmpGrad, tmpInst).g);
    EXPECT_FLOAT_EQ(.5f+static_cast<float>(tmpInst[i]%10),
                    get(i, tmpGrad, tmpInst).h);
  }
  delete [] tmpGrad;
  delete [] tmpOff;
  delete [] tmpInst;
  delete [] tmpVal;
}
|
||||
|
||||
// Large-sample CSC setup with both node-id widths.
TEST(CudaGPUBuilderTest, SetupOneTimeDataLargeInt16) { testLargeData<int16_t>(); }

TEST(CudaGPUBuilderTest, SetupOneTimeDataLargeInt32) { testLargeData<int>(); }
|
||||
|
||||
// Maps a flat value index to its column via the CSC column boundaries.
// `offsets` has nCols+1 entries; returns -1 if `id` is past the last column.
int getColId(int* offsets, int id, int nCols) {
  for (int c = 0; c < nCols; ++c) {
    if (id < offsets[c + 1]) {
      return c;
    }
  }
  return -1;
}
|
||||
|
||||
// After a depth-1 build on the small sample: 3 nodes / 2 leaves, the root is
// the only interior node, the gradient totals balance between parent and
// children, and every value is still assigned to node 0.
template <typename node_id_t>
void testAllocate() {
  GPUBuilder<node_id_t> builder;
  std::shared_ptr<DMatrix> dm =
    setupGPUBuilder<node_id_t>("plugin/updater_gpu/test/cpp/data/small.sample.libsvm",
                               builder, 1);
  ASSERT_EQ(3, builder.maxNodes);
  ASSERT_EQ(2, builder.maxLeaves);
  Node<node_id_t>* n = new Node<node_id_t>[builder.maxNodes];
  updateHostPtr<Node<node_id_t> >(n, builder.nodes.data(), builder.maxNodes);
  for (int i = 0; i < builder.maxNodes; ++i) {
    // root (i == 0) is interior; both children are leaves; none unused
    const bool expectLeaf = (i != 0);
    EXPECT_EQ(expectLeaf, n[i].isLeaf());
    EXPECT_FALSE(n[i].isUnused());
  }
  gpu_gpair sum;
  sum.g = 0.f;
  sum.h = 0.f;
  for (int i = 0; i < builder.maxNodes; ++i) {
    if (!n[i].isUnused()) {
      sum += n[i].gradSum;
    }
  }
  // law of conservation of gradients! :) the children together hold the
  // same total as the root, so the overall sum is twice the root's.
  EXPECT_FLOAT_EQ(2.f*n[0].gradSum.g, sum.g);
  EXPECT_FLOAT_EQ(2.f*n[0].gradSum.h, sum.h);
  node_id_t* assigns = new node_id_t[builder.nVals];
  int* offsets = new int[builder.nCols+1];
  updateHostPtr<node_id_t>(assigns, builder.nodeAssigns.current(),
                           builder.nVals);
  updateHostPtr<int>(offsets, builder.colOffsets.data(), builder.nCols+1);
  for (int i = 0; i < builder.nVals; ++i) {
    EXPECT_EQ((node_id_t)0, assigns[i]);
  }
  delete [] n;
  delete [] assigns;
  delete [] offsets;
}
|
||||
|
||||
// Node allocation checks with both node-id widths.
TEST(CudaGPUBuilderTest, AllocateNodeDataInt16) { testAllocate<int16_t>(); }

TEST(CudaGPUBuilderTest, AllocateNodeDataInt32) { testAllocate<int>(); }
|
||||
|
||||
// Populates every field of a Node in one call: gradient sum (g, h), split
// score, leaf weight, default direction, split threshold, column index and
// node id. Test-only convenience for hand-building trees.
template <typename node_id_t>
void assign(Node<node_id_t> *n, float g, float h, float sc, float wt,
            DefaultDirection d, float th, int c, int i) {
  n->id = (node_id_t)i;
  n->colIdx = c;
  n->threshold = th;
  n->dir = d;
  n->weight = wt;
  n->score = sc;
  n->gradSum.g = g;
  n->gradSum.h = h;
}
|
||||
|
||||
// Hand-crafts a depth-3 tree layout on the device, runs markLeaves(), and
// checks which nodes come back flagged as unused vs. leaves.
template <typename node_id_t>
void testMarkLeaves() {
  GPUBuilder<node_id_t> builder;
  std::shared_ptr<DMatrix> dm =
    setupGPUBuilder<node_id_t>("plugin/updater_gpu/test/cpp/data/small.sample.libsvm",
                               builder, 3);
  ASSERT_EQ(15, builder.maxNodes);
  ASSERT_EQ(8, builder.maxLeaves);
  Node<node_id_t>* hNodes = new Node<node_id_t>[builder.maxNodes];
  // populate the in-use slots; slot 5 and everything from 7 on stay
  // value-initialized (i.e. unused)
  assign<node_id_t>(&hNodes[0], 2.f, 1.f, .75f, 0.5f, LeftDir, 0.25f, 0, 0);
  assign<node_id_t>(&hNodes[1], 2.f, 1.f, .75f, 0.5f, RightDir, 0.5f, 1, 1);
  assign<node_id_t>(&hNodes[2], 2.f, 1.f, .75f, 0.5f, LeftDir, 0.75f, 2, 2);
  assign<node_id_t>(&hNodes[3], 2.f, 1.f, .75f, 0.5f, RightDir, 1.f, 3, 3);
  assign<node_id_t>(&hNodes[4], 2.f, 1.f, .75f, 0.5f, LeftDir, 1.25f, 4, 4);
  hNodes[5] = Node<node_id_t>();
  assign<node_id_t>(&hNodes[6], 2.f, 1.f, .75f, 0.5f, LeftDir, 1.75f, 6, 6);
  for (int i = 7; i < builder.maxNodes; ++i) {
    hNodes[i] = Node<node_id_t>();
  }
  updateDevicePtr<Node<node_id_t> >(builder.nodes.data(), hNodes,
                                    builder.maxNodes);
  builder.markLeaves();
  Node<node_id_t>* outNodes = new Node<node_id_t>[builder.maxNodes];
  updateHostPtr<Node<node_id_t> >(outNodes, builder.nodes.data(),
                                  builder.maxNodes);
  for (int i = 0; i < builder.maxNodes; ++i) {
    // slot 5 and 7..14 were never populated -> unused
    const bool expectUnused = (i >= 7) || (i == 5);
    EXPECT_EQ(expectUnused, outNodes[i].isUnused());
    // nodes whose children are all unused become leaves
    const bool expectLeaf = (i == 3) || (i == 4) || (i == 6);
    EXPECT_EQ(expectLeaf, outNodes[i].isLeaf());
  }
  delete [] outNodes;
  delete [] hNodes;
}
|
||||
|
||||
// Leaf-marking checks with both node-id widths.
TEST(CudaGPUBuilderTest, MarkLeavesInt16) { testMarkLeaves<int16_t>(); }

TEST(CudaGPUBuilderTest, MarkLeavesInt32) { testMarkLeaves<int>(); }
|
||||
|
||||
// Hand-crafts a dense node array (with holes), marks leaves, then converts
// to the sparse RegTree representation and checks the resulting node count.
template <typename node_id_t>
void testDense2Sparse() {
  GPUBuilder<node_id_t> builder;
  std::shared_ptr<DMatrix> dm =
    setupGPUBuilder<node_id_t>("plugin/updater_gpu/test/cpp/data/small.sample.libsvm",
                               builder, 3);
  ASSERT_EQ(15, builder.maxNodes);
  ASSERT_EQ(8, builder.maxLeaves);
  Node<node_id_t>* hNodes = new Node<node_id_t>[builder.maxNodes];
  // same layout as testMarkLeaves, plus an extra populated slot 7
  assign<node_id_t>(&hNodes[0], 2.f, 1.f, .75f, 0.5f, LeftDir, 0.25f, 0, 0);
  assign<node_id_t>(&hNodes[1], 2.f, 1.f, .75f, 0.5f, RightDir, 0.5f, 1, 1);
  assign<node_id_t>(&hNodes[2], 2.f, 1.f, .75f, 0.5f, LeftDir, 0.75f, 2, 2);
  assign<node_id_t>(&hNodes[3], 2.f, 1.f, .75f, 0.5f, RightDir, 1.f, 3, 3);
  assign<node_id_t>(&hNodes[4], 2.f, 1.f, .75f, 0.5f, LeftDir, 1.25f, 4, 4);
  hNodes[5] = Node<node_id_t>();  // hole in the dense layout
  assign<node_id_t>(&hNodes[6], 2.f, 1.f, .75f, 0.5f, LeftDir, 1.75f, 6, 6);
  assign<node_id_t>(&hNodes[7], 2.f, 1.f, .75f, 0.5f, LeftDir, 1.75f, 7, 7);
  for (int i = 8; i < builder.maxNodes; ++i) {
    hNodes[i] = Node<node_id_t>();
  }
  updateDevicePtr<Node<node_id_t> >(builder.nodes.data(), hNodes,
                                    builder.maxNodes);
  builder.markLeaves();
  RegTree tree;
  builder.dense2sparse(tree);
  // expected sparse node count for this particular layout
  EXPECT_EQ(9, tree.param.num_nodes);
  delete [] hNodes;
}
|
||||
|
||||
// Dense-to-sparse conversion with both node-id widths.
TEST(CudaGPUBuilderTest, Dense2SparseInt16) { testDense2Sparse<int16_t>(); }

TEST(CudaGPUBuilderTest, Dense2SparseInt32) { testDense2Sparse<int>(); }
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
64
plugin/updater_gpu/test/cpp/node.cu
Normal file
64
plugin/updater_gpu/test/cpp/node.cu
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "gtest/gtest.h"
|
||||
#include "../../src/exact/node.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
// A default Split is not splittable; it becomes splittable only once both a
// score and an index are set, and only while the score clears the threshold.
TEST(Split, Test) {
  Split s;
  EXPECT_FALSE(s.isSplittable(0.5f));   // default state
  s.score = 1.f;
  EXPECT_FALSE(s.isSplittable(0.5f));   // score alone is not enough
  s.index = 2;
  EXPECT_TRUE(s.isSplittable(0.5f));    // score above threshold
  EXPECT_FALSE(s.isSplittable(1.5f));   // score below threshold
}
|
||||
|
||||
// Walks a Node<int16_t> through its state transitions: default (unused),
// direction flipped, id set (used leaf), then score set (interior node).
TEST(Node, Test) {
  Node<int16_t> n;
  // freshly constructed: unused, not a leaf, defaults left
  EXPECT_TRUE(n.isUnused());
  EXPECT_FALSE(n.isLeaf());
  EXPECT_TRUE(n.isDefaultLeft());
  n.dir = RightDir;
  // flipping the direction alone changes nothing else
  EXPECT_TRUE(n.isUnused());
  EXPECT_FALSE(n.isLeaf());
  EXPECT_FALSE(n.isDefaultLeft());
  n.id = 123;
  // a valid id marks the node as used; with no score it reads as a leaf
  EXPECT_FALSE(n.isUnused());
  EXPECT_TRUE(n.isLeaf());
  EXPECT_FALSE(n.isDefaultLeft());
  n.score = 0.5f;
  // a score turns it into an interior (non-leaf) node
  EXPECT_FALSE(n.isUnused());
  EXPECT_FALSE(n.isLeaf());
  EXPECT_FALSE(n.isDefaultLeft());
}
|
||||
|
||||
// A default Segment is invalid; setting `start` alone is not enough, it
// becomes valid only once `end` is set as well.
TEST(Segment, Test) {
  Segment s;
  EXPECT_FALSE(s.isValid());
  s.start = 2;
  EXPECT_FALSE(s.isValid());
  s.end = 4;
  EXPECT_TRUE(s.isValid());
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
41
plugin/updater_gpu/test/cpp/utils.cu
Normal file
41
plugin/updater_gpu/test/cpp/utils.cu
Normal file
@@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include "utils.cuh"
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
// Loads `file` in libsvm format into a ref-counted DMatrix.
// NOTE(review): the two boolean Load() args are passed as false — presumably
// silent / load_row_split; confirm against DMatrix::Load's signature.
std::shared_ptr<DMatrix> generateData(const std::string& file) {
  return std::shared_ptr<DMatrix>(
      DMatrix::Load(file, false, false, "libsvm"));
}
|
||||
|
||||
// Loads `file` and synthesizes one gradient pair per row using a
// deterministic rule: g = 1 + (row % 10), h = 0.5 + (row % 10). The tests
// rely on this rule to validate the builder's gradient handling.
// Fix: the loop index was a signed `int` compared against the unsigned
// num_row, which triggers a sign-compare warning and would overflow for
// very large matrices; use size_t throughout.
std::shared_ptr<DMatrix> preparePluginInputs(const std::string &file,
                                             std::vector<bst_gpair> *gpair) {
  std::shared_ptr<DMatrix> dm = generateData(file);
  const size_t nRows = dm->info().num_row;
  gpair->reserve(nRows);
  for (size_t i = 0; i < nRows; ++i) {
    gpair->push_back(bst_gpair(1.f+static_cast<float>(i%10),
                               0.5f+static_cast<float>(i%10)));
  }
  return dm;
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
232
plugin/updater_gpu/test/cpp/utils.cuh
Normal file
232
plugin/updater_gpu/test/cpp/utils.cuh
Normal file
@@ -0,0 +1,232 @@
|
||||
/*
|
||||
* Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <random>
|
||||
#include "../../src/exact/gradients.cuh"
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <xgboost/data.h>
|
||||
#include "gtest/gtest.h"
|
||||
#include "../../src/exact/gpu_builder.cuh"
|
||||
#include "../../src/device_helpers.cuh"
|
||||
#include <vector>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
namespace xgboost {
|
||||
namespace tree {
|
||||
namespace exact {
|
||||
|
||||
|
||||
// Allocates `nElems` elements of T in device global memory; any CUDA error
// is surfaced through dh::safe_cuda.
template <typename T>
inline void allocateOnGpu(T*& arr, size_t nElems) {
  dh::safe_cuda(cudaMalloc(reinterpret_cast<void**>(&arr),
                           sizeof(T) * nElems));
}
|
||||
|
||||
// Synchronous host -> device copy of `nElems` elements.
template <typename T>
inline void updateDevicePtr(T* dArr, const T* hArr, size_t nElems) {
  const size_t nBytes = sizeof(T) * nElems;
  dh::safe_cuda(cudaMemcpy(dArr, hArr, nBytes, cudaMemcpyHostToDevice));
}
|
||||
|
||||
// Synchronous device -> host copy of `nElems` elements.
template <typename T>
inline void updateHostPtr(T* hArr, const T* dArr, size_t nElems) {
  const size_t nBytes = sizeof(T) * nElems;
  dh::safe_cuda(cudaMemcpy(hArr, dArr, nBytes, cudaMemcpyDeviceToHost));
}
|
||||
|
||||
// Convenience: device allocation immediately followed by a host -> device
// copy of the same `nElems` elements.
template <typename T>
inline void allocateAndUpdateOnGpu(T*& dArr, const T* hArr, size_t nElems) {
  allocateOnGpu(dArr, nElems);
  updateDevicePtr(dArr, hArr, nElems);
}
|
||||
|
||||
static const float Thresh = 0.005f;
|
||||
static const float SuperSmall = 0.001f;
|
||||
static const float SuperSmallThresh = 0.00001f;
|
||||
|
||||
// lets assume dense matrix for simplicity
|
||||
// Test-data generator shared by the kernel tests. Produces a dense
// nCols x nRows value matrix (column-major), per-element keys that are
// non-decreasing within each column, and the matching column-id /
// instance-id arrays, mirrored on host and device.
// Fixes: (1) randVal(min, max) computed `val*(max-min) - min`, mapping
// [0,1] onto [-min, max-2*min] (e.g. [1,3] for (-1,1)) instead of
// [min, max]; (2) compare() reused one `isSmall` flag for both components
// (the second diffRatio call overwrote the first) and mixed `Thresh` and
// `thresh` between the diagnostic print and the asserts.
template <typename T>
class Generator {
 public:
  // nc/nr/nk: columns, rows and distinct keys per column; tName labels the
  // TIMEIT output of derived benchmarks.
  Generator(int nc, int nr, int nk, const std::string& tName):
    nCols(nc), nRows(nr), nKeys(nk), size(nc*nr), hKeys(nullptr),
    dKeys(nullptr), hVals(nullptr), dVals(nullptr), testName(tName),
    dColIds(nullptr), hColIds(nullptr), dInstIds(nullptr),
    hInstIds(nullptr) {
    generateKeys();
    generateVals();
    // to simulate the same sorted key-value pairs in the main code;
    // not needed, as generateKeys always generates in sorted order!
    // sortKeyValues();
    evalColIds();
    evalInstIds();
  }

  virtual ~Generator() {
    delete [] hKeys;
    delete [] hVals;
    delete [] hColIds;
    delete [] hInstIds;
    dh::safe_cuda(cudaFree(dColIds));
    dh::safe_cuda(cudaFree(dKeys));
    dh::safe_cuda(cudaFree(dVals));
    dh::safe_cuda(cudaFree(dInstIds));
  }

  virtual void run() = 0;

 protected:
  int nCols;
  int nRows;
  int nKeys;             // distinct keys per column
  int size;              // nCols * nRows
  T* hKeys;
  T* dKeys;
  gpu_gpair* hVals;
  gpu_gpair* dVals;
  std::string testName;  // label for timing output
  int* dColIds;
  int* hColIds;
  int* dInstIds;
  int* hInstIds;

  // colId[i] = i / nRows (dense column-major layout)
  void evalColIds() {
    hColIds = new int[size];
    for (int i = 0; i < size; ++i) {
      hColIds[i] = i / nRows;
    }
    allocateAndUpdateOnGpu<int>(dColIds, hColIds, size);
  }

  // instance ids are simply the flat element indices
  void evalInstIds() {
    hInstIds = new int[size];
    for (int i = 0; i < size; ++i) {
      hInstIds[i] = i;
    }
    allocateAndUpdateOnGpu<int>(dInstIds, hInstIds, size);
  }

  // Relative difference between a and b; falls back to an absolute
  // difference (and reports isSmall=true) when either value is zero or
  // both magnitudes are below SuperSmall.
  float diffRatio(float a, float b, bool& isSmall) {
    isSmall = true;
    if (a == 0.f) {
      return fabs(b);
    } else if (b == 0.f) {
      return fabs(a);
    } else if ((fabs(a) < SuperSmall) && (fabs(b) < SuperSmall)) {
      return fabs(a - b);
    } else {
      isSmall = false;
      return fabs((a < b)? (b - a)/b : (a - b)/a);
    }
  }

  // Compares an expected host buffer against a device buffer element-wise.
  // Fix: thresholds are now tracked per component; the original overwrote
  // `isSmall` with the second diffRatio call and compared the print
  // condition against `Thresh` while asserting against `thresh`.
  void compare(gpu_gpair* exp, gpu_gpair* dAct, size_t len) {
    gpu_gpair* act = new gpu_gpair[len];
    updateHostPtr<gpu_gpair>(act, dAct, len);
    for (size_t i = 0; i < len; ++i) {
      bool smallG, smallH;
      float ratioG = diffRatio(exp[i].g, act[i].g, smallG);
      float ratioH = diffRatio(exp[i].h, act[i].h, smallH);
      float threshG = smallG ? SuperSmallThresh : Thresh;
      float threshH = smallH ? SuperSmallThresh : Thresh;
      if ((ratioG >= threshG) || (ratioH >= threshH)) {
        printf("(exp) %f %f -> (act) %f %f : rG=%f rH=%f th=%f @%lu\n",
               exp[i].g, exp[i].h, act[i].g, act[i].h, ratioG, ratioH,
               threshG, i);
      }
      ASSERT_TRUE(ratioG < threshG);
      ASSERT_TRUE(ratioH < threshH);
    }
    delete [] act;
  }

  // Keys restart at 0 for each column and are bumped with ~20% probability
  // until nKeys-1, so they are non-decreasing within a column.
  void generateKeys() {
    hKeys = new T[size];
    T currKey = 0;
    for (int i = 0; i < size; ++i) {
      if (i % nRows == 0) {  // start fresh for a new column
        currKey = 0;
      }
      hKeys[i] = currKey;
      float val = randVal();
      if ((val > 0.8f) && (currKey < nKeys-1)) {
        ++currKey;
      }
    }
    allocateAndUpdateOnGpu<T>(dKeys, hKeys, size);
  }

  // Random gradient pairs in [-1, 1] x [-1, 1].
  void generateVals() {
    hVals = new gpu_gpair[size];
    for (int i = 0; i < size; ++i) {
      hVals[i].g = randVal(-1.f, 1.f);
      hVals[i].h = randVal(-1.f, 1.f);
    }
    allocateAndUpdateOnGpu<gpu_gpair>(dVals, hVals, size);
  }

  // Currently unused (keys are generated pre-sorted). NOTE(review): this
  // passes the same buffers as SortPairs input and output — confirm cub
  // supports aliased in/out buffers before re-enabling.
  void sortKeyValues() {
    char* storage = nullptr;
    size_t tmpSize;
    dh::safe_cuda(cub::DeviceRadixSort::SortPairs(NULL, tmpSize, dKeys, dKeys,
                                                  dVals, dVals, size));
    allocateOnGpu<char>(storage, tmpSize);
    void* tmpStorage = static_cast<void*>(storage);
    dh::safe_cuda(cub::DeviceRadixSort::SortPairs(tmpStorage, tmpSize, dKeys,
                                                  dKeys, dVals, dVals, size));
    dh::safe_cuda(cudaFree(storage));
    updateHostPtr<gpu_gpair>(hVals, dVals, size);
    updateHostPtr<T>(hKeys, dKeys, size);
  }

  // Uniform random float in [0, 1].
  float randVal() const {
    return rand() * 1.f / RAND_MAX;
  }

  // Uniform random float in [min, max].
  // Fix: the original computed `val*(max-min) - min`, which for
  // (min, max) = (-1, 1) yields values in [1, 3] rather than [-1, 1].
  float randVal(float min, float max) const {
    return (randVal() * (max - min)) + min;
  }
};
|
||||
|
||||
|
||||
std::shared_ptr<DMatrix> generateData(const std::string& file);
|
||||
|
||||
std::shared_ptr<DMatrix> preparePluginInputs(const std::string& file,
|
||||
std::vector<bst_gpair> *gpair);
|
||||
|
||||
// Runs one full builder Update() over the data in `file` with fixed
// training parameters and the given max_depth, returning the DMatrix so it
// outlives the builder state inspected by the tests.
template <typename node_id_t>
std::shared_ptr<DMatrix> setupGPUBuilder(const std::string& file,
                                         GPUBuilder<node_id_t> &builder,
                                         int max_depth=1) {
  std::vector<bst_gpair> gpair;
  std::shared_ptr<DMatrix> dm = preparePluginInputs(file, &gpair);
  // fixed, minimal-regularization parameters (only reg_lambda = 1)
  TrainParam p;
  p.min_split_loss = 0.f;
  p.max_depth = max_depth;
  p.min_child_weight = 0.f;
  p.reg_alpha = 0.f;
  p.reg_lambda = 1.f;
  p.max_delta_step = 0.f;
  RegTree tree;
  builder.Init(p);
  builder.Update(gpair, dm.get(), &tree);
  return dm;
}
|
||||
|
||||
} // namespace exact
|
||||
} // namespace tree
|
||||
} // namespace xgboost
|
||||
Reference in New Issue
Block a user