Various bug fixes (#2825)

* Fatal error if GPU algorithm selected without GPU support compiled

* Resolve type conversion warnings

* Fix gpu unit test failure

* Fix compressed iterator edge case

* Fix python unit test failures due to flake8 update on pip
Author: Rory Mitchell
Date: 2017-10-25 14:45:01 +13:00
Committer: GitHub (web-merged)
Parent: c71b62d48d
Commit: 13e7a2cff0
21 changed files with 163 additions and 180 deletions

=== Diff for changed file (filename not captured in this page scrape) ===

@@ -241,7 +241,7 @@ XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad,
template <typename TrainingParams, typename T>
XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) {
if (sum_hess < p.min_child_weight)
return 0.0;
return T(0.0);
if (p.max_delta_step == 0.0f) {
if (p.reg_alpha == 0.0f) {
return Sqr(sum_grad) / (sum_hess + p.reg_lambda);
@@ -251,11 +251,11 @@ XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess
}
} else {
T w = CalcWeight(p, sum_grad, sum_hess);
T ret = sum_grad * w + 0.5 * (sum_hess + p.reg_lambda) * Sqr(w);
T ret = sum_grad * w + T(0.5) * (sum_hess + p.reg_lambda) * Sqr(w);
if (p.reg_alpha == 0.0f) {
return -2.0 * ret;
return T(-2.0) * ret;
} else {
return -2.0 * (ret + p.reg_alpha * std::abs(w));
return T(-2.0) * (ret + p.reg_alpha * std::abs(w));
}
}
}

=== Diff for next changed file (filename not captured in this page scrape) ===

@@ -630,7 +630,8 @@ class GPUMaker : public TreeUpdater {
throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
}
std::vector<float> fval;
std::vector<int> fId, offset;
std::vector<int> fId;
std::vector<size_t> offset;
convertToCsc(dmat, &fval, &fId, &offset);
allocateAllData(static_cast<int>(offset.size()));
transferAndSortData(fval, fId, offset);
@@ -638,10 +639,12 @@ class GPUMaker : public TreeUpdater {
}
void convertToCsc(DMatrix* dmat, std::vector<float>* fval,
std::vector<int>* fId, std::vector<int>* offset) {
std::vector<int>* fId, std::vector<size_t>* offset) {
MetaInfo info = dmat->info();
nRows = info.num_row;
nCols = info.num_col;
CHECK(info.num_col < std::numeric_limits<int>::max());
CHECK(info.num_row < std::numeric_limits<int>::max());
nRows = static_cast<int>(info.num_row);
nCols = static_cast<int>(info.num_col);
offset->reserve(nCols + 1);
offset->push_back(0);
fval->reserve(nCols * nRows);
@@ -667,12 +670,13 @@ class GPUMaker : public TreeUpdater {
offset->push_back(fval->size());
}
}
nVals = fval->size();
CHECK(fval->size() < std::numeric_limits<int>::max());
nVals = static_cast<int>(fval->size());
}
void transferAndSortData(const std::vector<float>& fval,
const std::vector<int>& fId,
const std::vector<int>& offset) {
const std::vector<size_t>& offset) {
vals.current_dvec() = fval;
instIds.current_dvec() = fId;
colOffsets = offset;

=== Diff for next changed file (filename not captured in this page scrape) ===

@@ -104,7 +104,7 @@ struct DeviceHist {
template <int BLOCK_THREADS>
__global__ void find_split_kernel(
const gpair_sum_t* d_level_hist, int* d_feature_segments, int depth,
int n_features, int n_bins, DeviceNodeStats* d_nodes,
uint64_t n_features, int n_bins, DeviceNodeStats* d_nodes,
int nodes_offset_device, float* d_fidx_min_map, float* d_gidx_fvalue_map,
GPUTrainingParam gpu_param, bool* d_left_child_smallest_temp,
bool colsample, int* d_feature_flags) {
@@ -293,7 +293,8 @@ class GPUHistMaker : public TreeUpdater {
dh::Timer time1;
// set member num_rows and n_devices for rest of GPUHistBuilder members
info = &fmat.info();
num_rows = info->num_row;
CHECK(info->num_row < std::numeric_limits<bst_uint>::max());
num_rows = static_cast<bst_uint>(info->num_row);
n_devices = dh::n_devices(param.n_gpus, num_rows);
if (!initialised) {
@@ -396,15 +397,15 @@ class GPUHistMaker : public TreeUpdater {
fflush(stdout);
}
int n_bins = hmat_.row_ptr.back();
int n_features = hmat_.row_ptr.size() - 1;
int n_bins = static_cast<int >(hmat_.row_ptr.back());
int n_features = static_cast<int >(hmat_.row_ptr.size() - 1);
// deliniate data onto multiple gpus
device_row_segments.push_back(0);
device_element_segments.push_back(0);
bst_uint offset = 0;
bst_uint shard_size =
std::ceil(static_cast<double>(num_rows) / n_devices);
bst_uint shard_size = static_cast<bst_uint>(
std::ceil(static_cast<double>(num_rows) / n_devices));
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
int device_idx = dList[d_idx];
offset += shard_size;
@@ -425,7 +426,7 @@ class GPUHistMaker : public TreeUpdater {
// Construct feature map
std::vector<int> h_gidx_feature_map(n_bins);
for (int fidx = 0; fidx < n_features; fidx++) {
for (int i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
for (auto i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
h_gidx_feature_map[i] = fidx;
}
}
@@ -456,7 +457,7 @@ class GPUHistMaker : public TreeUpdater {
gidx_feature_map.resize(n_devices);
gidx_fvalue_map.resize(n_devices);
int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
int find_split_n_devices = static_cast<int >(std::pow(2, std::floor(std::log2(n_devices))));
find_split_n_devices =
std::min(n_nodes_level(param.max_depth), find_split_n_devices);
int max_num_nodes_device =
@@ -707,7 +708,7 @@ class GPUHistMaker : public TreeUpdater {
int nodes_offset_device = 0;
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
hist_vec[d_idx].GetLevelPtr(depth), feature_segments[d_idx].data(),
depth, (info->num_col), (hmat_.row_ptr.back()), nodes[d_idx].data(),
depth, info->num_col, hmat_.row_ptr.back(), nodes[d_idx].data(),
nodes_offset_device, fidx_min_map[d_idx].data(),
gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
left_child_smallest[d_idx].data(), colsample,
@@ -769,7 +770,7 @@ class GPUHistMaker : public TreeUpdater {
DeviceNodeStats* d_nodes = nodes[d_idx].data();
auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
auto d_gidx = device_matrix[d_idx].gidx;
int n_columns = info->num_col;
auto n_columns = info->num_col;
size_t begin = device_row_segments[d_idx];
size_t end = device_row_segments[d_idx + 1];

=== Diff for next changed file (filename not captured in this page scrape) ===

@@ -113,13 +113,11 @@ __device__ void EvaluateFeature(int fidx, const bst_gpair_integer* hist,
}
template <int BLOCK_THREADS>
__global__ void evaluate_split_kernel(const bst_gpair_integer* d_hist, int nidx,
int n_features, DeviceNodeStats nodes,
const int* d_feature_segments,
const float* d_fidx_min_map,
const float* d_gidx_fvalue_map,
GPUTrainingParam gpu_param,
DeviceSplitCandidate* d_split) {
__global__ void evaluate_split_kernel(
const bst_gpair_integer* d_hist, int nidx, uint64_t n_features,
DeviceNodeStats nodes, const int* d_feature_segments,
const float* d_fidx_min_map, const float* d_gidx_fvalue_map,
GPUTrainingParam gpu_param, DeviceSplitCandidate* d_split) {
typedef cub::KeyValuePair<int, float> ArgMaxT;
typedef cub::BlockScan<bst_gpair_integer, BLOCK_THREADS,
cub::BLOCK_SCAN_WARP_SCANS>
@@ -190,24 +188,6 @@ __device__ int BinarySearchRow(bst_uint begin, bst_uint end, gidx_iter_t data,
return -1;
}
template <int BLOCK_THREADS>
__global__ void RadixSortSmall(bst_uint* d_ridx, int* d_position, bst_uint n) {
typedef cub::BlockRadixSort<int, BLOCK_THREADS, 1, bst_uint> BlockRadixSort;
__shared__ typename BlockRadixSort::TempStorage temp_storage;
bool thread_active = threadIdx.x < n;
int thread_key[1];
bst_uint thread_value[1];
thread_key[0] = thread_active ? d_position[threadIdx.x] : INT_MAX;
thread_value[0] = thread_active ? d_ridx[threadIdx.x] : UINT_MAX;
BlockRadixSort(temp_storage).Sort(thread_key, thread_value);
if (thread_active) {
d_position[threadIdx.x] = thread_key[0];
d_ridx[threadIdx.x] = thread_value[0];
}
}
struct DeviceHistogram {
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
dh::dvec<bst_gpair_integer> data;
@@ -269,7 +249,7 @@ struct DeviceShard {
null_gidx_value(n_bins) {
// Convert to ELLPACK matrix representation
int max_elements_row = 0;
for (int i = row_begin; i < row_end; i++) {
for (auto i = row_begin; i < row_end; i++) {
max_elements_row =
(std::max)(max_elements_row,
static_cast<int>(gmat.row_ptr[i + 1] - gmat.row_ptr[i]));
@@ -277,9 +257,9 @@ struct DeviceShard {
row_stride = max_elements_row;
std::vector<int> ellpack_matrix(row_stride * n_rows, null_gidx_value);
for (int i = row_begin; i < row_end; i++) {
for (auto i = row_begin; i < row_end; i++) {
int row_count = 0;
for (int j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) {
for (auto j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) {
ellpack_matrix[i * row_stride + row_count] = gmat.index[j];
row_count++;
}
@@ -394,13 +374,8 @@ struct DeviceShard {
int right_nidx) {
auto n = segment.second - segment.first;
int min_bits = 0;
int max_bits = std::ceil(std::log2((std::max)(left_nidx, right_nidx) + 1));
// const int SINGLE_TILE_SIZE = 1024;
// if (n < SINGLE_TILE_SIZE) {
// RadixSortSmall<SINGLE_TILE_SIZE>
// <<<1, SINGLE_TILE_SIZE>>>(ridx.current() + segment.first,
// position.current() + segment.first, n);
//} else {
int max_bits = static_cast<int>(
std::ceil(std::log2((std::max)(left_nidx, right_nidx) + 1)));
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairs(
@@ -509,7 +484,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
nidx_set.size());
auto d_split = shard.temp_memory.Pointer<DeviceSplitCandidate>();
auto& streams = shard.GetStreams(nidx_set.size());
auto& streams = shard.GetStreams(static_cast<int>(nidx_set.size()));
// Use streams to process nodes concurrently
for (auto i = 0; i < nidx_set.size(); i++) {
@@ -518,7 +493,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
const int BLOCK_THREADS = 256;
evaluate_split_kernel<BLOCK_THREADS>
<<<columns, BLOCK_THREADS, 0, streams[i]>>>(
<<<uint32_t(columns), BLOCK_THREADS, 0, streams[i]>>>(
shard.hist.node_map[nidx], nidx, info->num_col, node,
shard.feature_segments.data(), shard.min_fvalue.data(),
shard.gidx_fvalue_map.data(), GPUTrainingParam(param),
@@ -573,10 +548,11 @@ class GPUHistMakerExperimental : public TreeUpdater {
__host__ __device__ int operator()(int x) const { return x == val; }
};
__device__ void CountLeft(bst_uint* d_count, int val, int left_nidx) {
__device__ void CountLeft(int64_t* d_count, int val, int left_nidx) {
unsigned ballot = __ballot(val == left_nidx);
if (threadIdx.x % 32 == 0) {
atomicAdd(d_count, __popc(ballot));
atomicAdd(reinterpret_cast<unsigned long long*>(d_count), // NOLINT
static_cast<unsigned long long>(__popc(ballot))); // NOLINT
}
}
@@ -601,9 +577,9 @@ class GPUHistMakerExperimental : public TreeUpdater {
for (auto& shard : shards) {
monitor.Start("update position kernel");
shard.temp_memory.LazyAllocate(sizeof(bst_uint));
auto d_left_count = shard.temp_memory.Pointer<bst_uint>();
dh::safe_cuda(cudaMemset(d_left_count, 0, sizeof(bst_uint)));
shard.temp_memory.LazyAllocate(sizeof(int64_t));
auto d_left_count = shard.temp_memory.Pointer<int64_t>();
dh::safe_cuda(cudaMemset(d_left_count, 0, sizeof(int64_t)));
dh::safe_cuda(cudaSetDevice(shard.device_idx));
auto segment = shard.ridx_segments[nidx];
CHECK_GT(segment.second - segment.first, 0);
@@ -639,8 +615,8 @@ class GPUHistMakerExperimental : public TreeUpdater {
d_position[idx] = position;
});
bst_uint left_count;
dh::safe_cuda(cudaMemcpy(&left_count, d_left_count, sizeof(bst_uint),
int64_t left_count;
dh::safe_cuda(cudaMemcpy(&left_count, d_left_count, sizeof(int64_t),
cudaMemcpyDeviceToHost));
monitor.Stop("update position kernel");
@@ -722,7 +698,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
this->InitRoot(gpair, p_tree);
monitor.Stop("InitRoot");
unsigned timestamp = qexpand_->size();
auto timestamp = qexpand_->size();
auto num_leaves = 1;
while (!qexpand_->empty()) {
@@ -764,9 +740,9 @@ class GPUHistMakerExperimental : public TreeUpdater {
int nid;
int depth;
DeviceSplitCandidate split;
unsigned timestamp;
uint64_t timestamp;
ExpandEntry(int nid, int depth, const DeviceSplitCandidate& split,
unsigned timestamp)
uint64_t timestamp)
: nid(nid), depth(depth), split(split), timestamp(timestamp) {}
bool IsValid(const TrainParam& param, int num_leaves) const {
if (split.loss_chg <= rt_eps) return false;