Various bug fixes (#2825)
* Raise a fatal error if a GPU algorithm is selected without GPU support compiled in
* Resolve type conversion warnings
* Fix GPU unit test failure
* Fix compressed iterator edge case
* Fix Python unit test failures due to flake8 update on pip
This commit is contained in:
@@ -241,7 +241,7 @@ XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad,
|
||||
template <typename TrainingParams, typename T>
|
||||
XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) {
|
||||
if (sum_hess < p.min_child_weight)
|
||||
return 0.0;
|
||||
return T(0.0);
|
||||
if (p.max_delta_step == 0.0f) {
|
||||
if (p.reg_alpha == 0.0f) {
|
||||
return Sqr(sum_grad) / (sum_hess + p.reg_lambda);
|
||||
@@ -251,11 +251,11 @@ XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess
|
||||
}
|
||||
} else {
|
||||
T w = CalcWeight(p, sum_grad, sum_hess);
|
||||
T ret = sum_grad * w + 0.5 * (sum_hess + p.reg_lambda) * Sqr(w);
|
||||
T ret = sum_grad * w + T(0.5) * (sum_hess + p.reg_lambda) * Sqr(w);
|
||||
if (p.reg_alpha == 0.0f) {
|
||||
return -2.0 * ret;
|
||||
return T(-2.0) * ret;
|
||||
} else {
|
||||
return -2.0 * (ret + p.reg_alpha * std::abs(w));
|
||||
return T(-2.0) * (ret + p.reg_alpha * std::abs(w));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -630,7 +630,8 @@ class GPUMaker : public TreeUpdater {
|
||||
throw std::runtime_error("exact::GPUBuilder - must have 1 column block");
|
||||
}
|
||||
std::vector<float> fval;
|
||||
std::vector<int> fId, offset;
|
||||
std::vector<int> fId;
|
||||
std::vector<size_t> offset;
|
||||
convertToCsc(dmat, &fval, &fId, &offset);
|
||||
allocateAllData(static_cast<int>(offset.size()));
|
||||
transferAndSortData(fval, fId, offset);
|
||||
@@ -638,10 +639,12 @@ class GPUMaker : public TreeUpdater {
|
||||
}
|
||||
|
||||
void convertToCsc(DMatrix* dmat, std::vector<float>* fval,
|
||||
std::vector<int>* fId, std::vector<int>* offset) {
|
||||
std::vector<int>* fId, std::vector<size_t>* offset) {
|
||||
MetaInfo info = dmat->info();
|
||||
nRows = info.num_row;
|
||||
nCols = info.num_col;
|
||||
CHECK(info.num_col < std::numeric_limits<int>::max());
|
||||
CHECK(info.num_row < std::numeric_limits<int>::max());
|
||||
nRows = static_cast<int>(info.num_row);
|
||||
nCols = static_cast<int>(info.num_col);
|
||||
offset->reserve(nCols + 1);
|
||||
offset->push_back(0);
|
||||
fval->reserve(nCols * nRows);
|
||||
@@ -667,12 +670,13 @@ class GPUMaker : public TreeUpdater {
|
||||
offset->push_back(fval->size());
|
||||
}
|
||||
}
|
||||
nVals = fval->size();
|
||||
CHECK(fval->size() < std::numeric_limits<int>::max());
|
||||
nVals = static_cast<int>(fval->size());
|
||||
}
|
||||
|
||||
void transferAndSortData(const std::vector<float>& fval,
|
||||
const std::vector<int>& fId,
|
||||
const std::vector<int>& offset) {
|
||||
const std::vector<size_t>& offset) {
|
||||
vals.current_dvec() = fval;
|
||||
instIds.current_dvec() = fId;
|
||||
colOffsets = offset;
|
||||
|
||||
@@ -104,7 +104,7 @@ struct DeviceHist {
|
||||
template <int BLOCK_THREADS>
|
||||
__global__ void find_split_kernel(
|
||||
const gpair_sum_t* d_level_hist, int* d_feature_segments, int depth,
|
||||
int n_features, int n_bins, DeviceNodeStats* d_nodes,
|
||||
uint64_t n_features, int n_bins, DeviceNodeStats* d_nodes,
|
||||
int nodes_offset_device, float* d_fidx_min_map, float* d_gidx_fvalue_map,
|
||||
GPUTrainingParam gpu_param, bool* d_left_child_smallest_temp,
|
||||
bool colsample, int* d_feature_flags) {
|
||||
@@ -293,7 +293,8 @@ class GPUHistMaker : public TreeUpdater {
|
||||
dh::Timer time1;
|
||||
// set member num_rows and n_devices for rest of GPUHistBuilder members
|
||||
info = &fmat.info();
|
||||
num_rows = info->num_row;
|
||||
CHECK(info->num_row < std::numeric_limits<bst_uint>::max());
|
||||
num_rows = static_cast<bst_uint>(info->num_row);
|
||||
n_devices = dh::n_devices(param.n_gpus, num_rows);
|
||||
|
||||
if (!initialised) {
|
||||
@@ -396,15 +397,15 @@ class GPUHistMaker : public TreeUpdater {
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
int n_bins = hmat_.row_ptr.back();
|
||||
int n_features = hmat_.row_ptr.size() - 1;
|
||||
int n_bins = static_cast<int >(hmat_.row_ptr.back());
|
||||
int n_features = static_cast<int >(hmat_.row_ptr.size() - 1);
|
||||
|
||||
// deliniate data onto multiple gpus
|
||||
device_row_segments.push_back(0);
|
||||
device_element_segments.push_back(0);
|
||||
bst_uint offset = 0;
|
||||
bst_uint shard_size =
|
||||
std::ceil(static_cast<double>(num_rows) / n_devices);
|
||||
bst_uint shard_size = static_cast<bst_uint>(
|
||||
std::ceil(static_cast<double>(num_rows) / n_devices));
|
||||
for (int d_idx = 0; d_idx < n_devices; d_idx++) {
|
||||
int device_idx = dList[d_idx];
|
||||
offset += shard_size;
|
||||
@@ -425,7 +426,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
// Construct feature map
|
||||
std::vector<int> h_gidx_feature_map(n_bins);
|
||||
for (int fidx = 0; fidx < n_features; fidx++) {
|
||||
for (int i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
|
||||
for (auto i = hmat_.row_ptr[fidx]; i < hmat_.row_ptr[fidx + 1]; i++) {
|
||||
h_gidx_feature_map[i] = fidx;
|
||||
}
|
||||
}
|
||||
@@ -456,7 +457,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
gidx_feature_map.resize(n_devices);
|
||||
gidx_fvalue_map.resize(n_devices);
|
||||
|
||||
int find_split_n_devices = std::pow(2, std::floor(std::log2(n_devices)));
|
||||
int find_split_n_devices = static_cast<int >(std::pow(2, std::floor(std::log2(n_devices))));
|
||||
find_split_n_devices =
|
||||
std::min(n_nodes_level(param.max_depth), find_split_n_devices);
|
||||
int max_num_nodes_device =
|
||||
@@ -707,7 +708,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
int nodes_offset_device = 0;
|
||||
find_split_kernel<BLOCK_THREADS><<<GRID_SIZE, BLOCK_THREADS>>>(
|
||||
hist_vec[d_idx].GetLevelPtr(depth), feature_segments[d_idx].data(),
|
||||
depth, (info->num_col), (hmat_.row_ptr.back()), nodes[d_idx].data(),
|
||||
depth, info->num_col, hmat_.row_ptr.back(), nodes[d_idx].data(),
|
||||
nodes_offset_device, fidx_min_map[d_idx].data(),
|
||||
gidx_fvalue_map[d_idx].data(), GPUTrainingParam(param),
|
||||
left_child_smallest[d_idx].data(), colsample,
|
||||
@@ -769,7 +770,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
DeviceNodeStats* d_nodes = nodes[d_idx].data();
|
||||
auto d_gidx_fvalue_map = gidx_fvalue_map[d_idx].data();
|
||||
auto d_gidx = device_matrix[d_idx].gidx;
|
||||
int n_columns = info->num_col;
|
||||
auto n_columns = info->num_col;
|
||||
size_t begin = device_row_segments[d_idx];
|
||||
size_t end = device_row_segments[d_idx + 1];
|
||||
|
||||
|
||||
@@ -113,13 +113,11 @@ __device__ void EvaluateFeature(int fidx, const bst_gpair_integer* hist,
|
||||
}
|
||||
|
||||
template <int BLOCK_THREADS>
|
||||
__global__ void evaluate_split_kernel(const bst_gpair_integer* d_hist, int nidx,
|
||||
int n_features, DeviceNodeStats nodes,
|
||||
const int* d_feature_segments,
|
||||
const float* d_fidx_min_map,
|
||||
const float* d_gidx_fvalue_map,
|
||||
GPUTrainingParam gpu_param,
|
||||
DeviceSplitCandidate* d_split) {
|
||||
__global__ void evaluate_split_kernel(
|
||||
const bst_gpair_integer* d_hist, int nidx, uint64_t n_features,
|
||||
DeviceNodeStats nodes, const int* d_feature_segments,
|
||||
const float* d_fidx_min_map, const float* d_gidx_fvalue_map,
|
||||
GPUTrainingParam gpu_param, DeviceSplitCandidate* d_split) {
|
||||
typedef cub::KeyValuePair<int, float> ArgMaxT;
|
||||
typedef cub::BlockScan<bst_gpair_integer, BLOCK_THREADS,
|
||||
cub::BLOCK_SCAN_WARP_SCANS>
|
||||
@@ -190,24 +188,6 @@ __device__ int BinarySearchRow(bst_uint begin, bst_uint end, gidx_iter_t data,
|
||||
return -1;
|
||||
}
|
||||
|
||||
template <int BLOCK_THREADS>
|
||||
__global__ void RadixSortSmall(bst_uint* d_ridx, int* d_position, bst_uint n) {
|
||||
typedef cub::BlockRadixSort<int, BLOCK_THREADS, 1, bst_uint> BlockRadixSort;
|
||||
__shared__ typename BlockRadixSort::TempStorage temp_storage;
|
||||
|
||||
bool thread_active = threadIdx.x < n;
|
||||
int thread_key[1];
|
||||
bst_uint thread_value[1];
|
||||
thread_key[0] = thread_active ? d_position[threadIdx.x] : INT_MAX;
|
||||
thread_value[0] = thread_active ? d_ridx[threadIdx.x] : UINT_MAX;
|
||||
BlockRadixSort(temp_storage).Sort(thread_key, thread_value);
|
||||
|
||||
if (thread_active) {
|
||||
d_position[threadIdx.x] = thread_key[0];
|
||||
d_ridx[threadIdx.x] = thread_value[0];
|
||||
}
|
||||
}
|
||||
|
||||
struct DeviceHistogram {
|
||||
dh::bulk_allocator<dh::memory_type::DEVICE> ba;
|
||||
dh::dvec<bst_gpair_integer> data;
|
||||
@@ -269,7 +249,7 @@ struct DeviceShard {
|
||||
null_gidx_value(n_bins) {
|
||||
// Convert to ELLPACK matrix representation
|
||||
int max_elements_row = 0;
|
||||
for (int i = row_begin; i < row_end; i++) {
|
||||
for (auto i = row_begin; i < row_end; i++) {
|
||||
max_elements_row =
|
||||
(std::max)(max_elements_row,
|
||||
static_cast<int>(gmat.row_ptr[i + 1] - gmat.row_ptr[i]));
|
||||
@@ -277,9 +257,9 @@ struct DeviceShard {
|
||||
row_stride = max_elements_row;
|
||||
std::vector<int> ellpack_matrix(row_stride * n_rows, null_gidx_value);
|
||||
|
||||
for (int i = row_begin; i < row_end; i++) {
|
||||
for (auto i = row_begin; i < row_end; i++) {
|
||||
int row_count = 0;
|
||||
for (int j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) {
|
||||
for (auto j = gmat.row_ptr[i]; j < gmat.row_ptr[i + 1]; j++) {
|
||||
ellpack_matrix[i * row_stride + row_count] = gmat.index[j];
|
||||
row_count++;
|
||||
}
|
||||
@@ -394,13 +374,8 @@ struct DeviceShard {
|
||||
int right_nidx) {
|
||||
auto n = segment.second - segment.first;
|
||||
int min_bits = 0;
|
||||
int max_bits = std::ceil(std::log2((std::max)(left_nidx, right_nidx) + 1));
|
||||
// const int SINGLE_TILE_SIZE = 1024;
|
||||
// if (n < SINGLE_TILE_SIZE) {
|
||||
// RadixSortSmall<SINGLE_TILE_SIZE>
|
||||
// <<<1, SINGLE_TILE_SIZE>>>(ridx.current() + segment.first,
|
||||
// position.current() + segment.first, n);
|
||||
//} else {
|
||||
int max_bits = static_cast<int>(
|
||||
std::ceil(std::log2((std::max)(left_nidx, right_nidx) + 1)));
|
||||
|
||||
size_t temp_storage_bytes = 0;
|
||||
cub::DeviceRadixSort::SortPairs(
|
||||
@@ -509,7 +484,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
|
||||
nidx_set.size());
|
||||
auto d_split = shard.temp_memory.Pointer<DeviceSplitCandidate>();
|
||||
|
||||
auto& streams = shard.GetStreams(nidx_set.size());
|
||||
auto& streams = shard.GetStreams(static_cast<int>(nidx_set.size()));
|
||||
|
||||
// Use streams to process nodes concurrently
|
||||
for (auto i = 0; i < nidx_set.size(); i++) {
|
||||
@@ -518,7 +493,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
|
||||
|
||||
const int BLOCK_THREADS = 256;
|
||||
evaluate_split_kernel<BLOCK_THREADS>
|
||||
<<<columns, BLOCK_THREADS, 0, streams[i]>>>(
|
||||
<<<uint32_t(columns), BLOCK_THREADS, 0, streams[i]>>>(
|
||||
shard.hist.node_map[nidx], nidx, info->num_col, node,
|
||||
shard.feature_segments.data(), shard.min_fvalue.data(),
|
||||
shard.gidx_fvalue_map.data(), GPUTrainingParam(param),
|
||||
@@ -573,10 +548,11 @@ class GPUHistMakerExperimental : public TreeUpdater {
|
||||
__host__ __device__ int operator()(int x) const { return x == val; }
|
||||
};
|
||||
|
||||
__device__ void CountLeft(bst_uint* d_count, int val, int left_nidx) {
|
||||
__device__ void CountLeft(int64_t* d_count, int val, int left_nidx) {
|
||||
unsigned ballot = __ballot(val == left_nidx);
|
||||
if (threadIdx.x % 32 == 0) {
|
||||
atomicAdd(d_count, __popc(ballot));
|
||||
atomicAdd(reinterpret_cast<unsigned long long*>(d_count), // NOLINT
|
||||
static_cast<unsigned long long>(__popc(ballot))); // NOLINT
|
||||
}
|
||||
}
|
||||
|
||||
@@ -601,9 +577,9 @@ class GPUHistMakerExperimental : public TreeUpdater {
|
||||
|
||||
for (auto& shard : shards) {
|
||||
monitor.Start("update position kernel");
|
||||
shard.temp_memory.LazyAllocate(sizeof(bst_uint));
|
||||
auto d_left_count = shard.temp_memory.Pointer<bst_uint>();
|
||||
dh::safe_cuda(cudaMemset(d_left_count, 0, sizeof(bst_uint)));
|
||||
shard.temp_memory.LazyAllocate(sizeof(int64_t));
|
||||
auto d_left_count = shard.temp_memory.Pointer<int64_t>();
|
||||
dh::safe_cuda(cudaMemset(d_left_count, 0, sizeof(int64_t)));
|
||||
dh::safe_cuda(cudaSetDevice(shard.device_idx));
|
||||
auto segment = shard.ridx_segments[nidx];
|
||||
CHECK_GT(segment.second - segment.first, 0);
|
||||
@@ -639,8 +615,8 @@ class GPUHistMakerExperimental : public TreeUpdater {
|
||||
d_position[idx] = position;
|
||||
});
|
||||
|
||||
bst_uint left_count;
|
||||
dh::safe_cuda(cudaMemcpy(&left_count, d_left_count, sizeof(bst_uint),
|
||||
int64_t left_count;
|
||||
dh::safe_cuda(cudaMemcpy(&left_count, d_left_count, sizeof(int64_t),
|
||||
cudaMemcpyDeviceToHost));
|
||||
monitor.Stop("update position kernel");
|
||||
|
||||
@@ -722,7 +698,7 @@ class GPUHistMakerExperimental : public TreeUpdater {
|
||||
this->InitRoot(gpair, p_tree);
|
||||
monitor.Stop("InitRoot");
|
||||
|
||||
unsigned timestamp = qexpand_->size();
|
||||
auto timestamp = qexpand_->size();
|
||||
auto num_leaves = 1;
|
||||
|
||||
while (!qexpand_->empty()) {
|
||||
@@ -764,9 +740,9 @@ class GPUHistMakerExperimental : public TreeUpdater {
|
||||
int nid;
|
||||
int depth;
|
||||
DeviceSplitCandidate split;
|
||||
unsigned timestamp;
|
||||
uint64_t timestamp;
|
||||
ExpandEntry(int nid, int depth, const DeviceSplitCandidate& split,
|
||||
unsigned timestamp)
|
||||
uint64_t timestamp)
|
||||
: nid(nid), depth(depth), split(split), timestamp(timestamp) {}
|
||||
bool IsValid(const TrainParam& param, int num_leaves) const {
|
||||
if (split.loss_chg <= rt_eps) return false;
|
||||
|
||||
Reference in New Issue
Block a user