Further optimisations for gpu_hist. (#4283)

- Fuse the final update-position functions into a single, more efficient kernel

- Refactor gpu_hist with a more explicit ELLPACK matrix representation
This commit is contained in:
Rory Mitchell
2019-03-24 17:17:22 +13:00
committed by GitHub
parent 5aa42b5f11
commit 6d5b34d824
5 changed files with 345 additions and 297 deletions

View File

@@ -303,6 +303,7 @@ class GPUPredictor : public xgboost::Predictor {
const gbm::GBTreeModel& model, size_t tree_begin,
size_t tree_end) {
if (tree_end - tree_begin == 0) { return; }
monitor_.StartCuda("DevicePredictInternal");
CHECK_EQ(model.param.size_leaf_vector, 0);
// Copy decision trees to device
@@ -337,6 +338,7 @@ class GPUPredictor : public xgboost::Predictor {
});
i_batch++;
}
monitor_.StopCuda("DevicePredictInternal");
}
public:
@@ -388,9 +390,11 @@ class GPUPredictor : public xgboost::Predictor {
if (it != cache_.end()) {
const HostDeviceVector<bst_float>& y = it->second.predictions;
if (y.Size() != 0) {
monitor_.StartCuda("PredictFromCache");
out_preds->Reshard(y.Distribution());
out_preds->Resize(y.Size());
out_preds->Copy(y);
monitor_.StopCuda("PredictFromCache");
return true;
}
}
@@ -481,6 +485,7 @@ class GPUPredictor : public xgboost::Predictor {
std::unique_ptr<Predictor> cpu_predictor_;
std::vector<DeviceShard> shards_;
GPUSet devices_;
common::Monitor monitor_;
};
XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")