Added back UpdatePredictionCache() in updater_gpu_hist.cu. (#3120)
* Added back UpdatePredictionCache() in updater_gpu_hist.cu. It had been there before, but was not ported to the new version of updater_gpu_hist.cu.
This commit is contained in:
parent
d5f1b74ef5
commit
a1b48afa41
@ -378,9 +378,11 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
if (it != cache_.end()) {
|
||||
HostDeviceVector<bst_float>& y = it->second.predictions;
|
||||
if (y.size() != 0) {
|
||||
dh::safe_cuda(cudaSetDevice(param.gpu_id));
|
||||
out_preds->resize(y.size(), 0.0f, param.gpu_id);
|
||||
thrust::copy(y.tbegin(param.gpu_id), y.tend(param.gpu_id),
|
||||
out_preds->tbegin(param.gpu_id));
|
||||
dh::safe_cuda
|
||||
(cudaMemcpy(out_preds->ptr_d(param.gpu_id), y.ptr_d(param.gpu_id),
|
||||
out_preds->size() * sizeof(bst_float), cudaMemcpyDefault));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -406,8 +408,7 @@ class GPUPredictor : public xgboost::Predictor {
|
||||
static_cast<bst_uint>(model.trees.size()));
|
||||
} else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
|
||||
num_new_trees == 1 &&
|
||||
updaters->back()->UpdatePredictionCache(e.data.get(),
|
||||
&predictions)) {
|
||||
updaters->back()->UpdatePredictionCache(e.data.get(), &predictions)) {
|
||||
// do nothing
|
||||
} else {
|
||||
DevicePredictInternal(dmat, &predictions, model, old_ntree,
|
||||
|
||||
@ -210,6 +210,18 @@ struct DeviceHistogram {
|
||||
}
|
||||
};
|
||||
|
||||
// Small by-value copy of five float fields of TrainParam.
// The constructor is callable from both host and device code, and the
// resulting object holds only plain floats, so it can be handed to
// CalcWeight() without carrying the whole TrainParam around.
struct CalcWeightTrainParam {
  float min_child_weight;
  float reg_alpha;
  float reg_lambda;
  float max_delta_step;
  float learning_rate;

  // Copies the identically named members out of train_param.
  __host__ __device__ CalcWeightTrainParam(const TrainParam& train_param)
      : min_child_weight(train_param.min_child_weight),
        reg_alpha(train_param.reg_alpha),
        reg_lambda(train_param.reg_lambda),
        max_delta_step(train_param.max_delta_step),
        learning_rate(train_param.learning_rate) {}
};
|
||||
|
||||
// Manage memory for a single GPU
|
||||
struct DeviceShard {
|
||||
struct Segment {
|
||||
@ -236,7 +248,9 @@ struct DeviceShard {
|
||||
dh::dvec<float> gidx_fvalue_map;
|
||||
dh::dvec<float> min_fvalue;
|
||||
dh::dvec<int> monotone_constraints;
|
||||
dh::dvec<bst_float> prediction_cache;
|
||||
std::vector<bst_gpair> node_sum_gradients;
|
||||
dh::dvec<bst_gpair> node_sum_gradients_d;
|
||||
common::CompressedIterator<uint32_t> gidx;
|
||||
int row_stride;
|
||||
bst_uint row_begin_idx; // The row offset for this shard
|
||||
@ -246,6 +260,7 @@ struct DeviceShard {
|
||||
int null_gidx_value;
|
||||
DeviceHistogram hist;
|
||||
TrainParam param;
|
||||
bool prediction_cache_initialised;
|
||||
|
||||
int64_t* tmp_pinned; // Small amount of staging memory
|
||||
|
||||
@ -263,7 +278,8 @@ struct DeviceShard {
|
||||
n_rows(row_end - row_begin),
|
||||
n_bins(n_bins),
|
||||
null_gidx_value(n_bins),
|
||||
param(param) {
|
||||
param(param),
|
||||
prediction_cache_initialised(false) {
|
||||
// Convert to ELLPACK matrix representation
|
||||
int max_elements_row = 0;
|
||||
for (auto i = row_begin; i < row_end; i++) {
|
||||
@ -296,6 +312,7 @@ struct DeviceShard {
|
||||
param.max_leaves > 0 ? param.max_leaves * 2 : n_nodes(param.max_depth);
|
||||
ba.allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes,
|
||||
&gpair, n_rows, &ridx, n_rows, &position, n_rows,
|
||||
&prediction_cache, n_rows, &node_sum_gradients_d, max_nodes,
|
||||
&feature_segments, gmat.cut->row_ptr.size(), &gidx_fvalue_map,
|
||||
gmat.cut->cut.size(), &min_fvalue, gmat.cut->min_val.size(),
|
||||
&monotone_constraints, param.monotone_constraints.size());
|
||||
@ -481,13 +498,46 @@ struct DeviceShard {
|
||||
ridx.current() + segment.begin, ridx.other() + segment.begin,
|
||||
segment.Size() * sizeof(bst_uint), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
|
||||
// Updates this shard's cached predictions and writes them back to the
// caller's buffer.
//
// out_preds_d: base pointer of the prediction buffer. This shard reads and
//   writes prediction_cache.size() entries starting at offset row_begin_idx.
//   NOTE(review): the copies use cudaMemcpyDefault, so the pointer is
//   presumably addressable from device_idx -- confirm against the caller.
//
// Steps:
//   1. On the first call only, seed prediction_cache from out_preds_d.
//   2. Upload the host-side node_sum_gradients to node_sum_gradients_d.
//   3. For every local index i, add
//        CalcWeight(param, node_sum_gradients_d[position[i]]) * learning_rate
//      to prediction_cache[ridx[i]].
//   4. Copy prediction_cache back into out_preds_d at row_begin_idx.
void UpdatePredictionCache(bst_float* out_preds_d) {
  dh::safe_cuda(cudaSetDevice(device_idx));

  // Slice of the caller's buffer owned by this shard, and its size in bytes.
  bst_float* shard_preds = out_preds_d + row_begin_idx;
  const auto cache_bytes = prediction_cache.size() * sizeof(bst_float);

  if (!prediction_cache_initialised) {
    dh::safe_cuda(cudaMemcpy(prediction_cache.data(), shard_preds,
                             cache_bytes, cudaMemcpyDefault));
    prediction_cache_initialised = true;
  }

  // Make the per-node gradient sums available to the kernel below.
  thrust::copy(node_sum_gradients.begin(), node_sum_gradients.end(),
               node_sum_gradients_d.tbegin());

  // Everything the lambda touches is copied into locals first, so the
  // by-value capture picks up plain values/pointers instead of `this`.
  CalcWeightTrainParam weight_param(param);
  auto node_of_row = position.current();
  auto row_index = ridx.current();
  auto gradient_sums = node_sum_gradients_d.data();
  auto cache = prediction_cache.data();

  dh::launch_n(device_idx, prediction_cache.size(),
               [=] __device__(int i) {
                 int node = node_of_row[i];
                 bst_float leaf_weight =
                     CalcWeight(weight_param, gradient_sums[node]);
                 cache[row_index[i]] += leaf_weight * weight_param.learning_rate;
               });

  // Publish the updated cache back into the caller's buffer.
  dh::safe_cuda(cudaMemcpy(shard_preds, prediction_cache.data(),
                           cache_bytes, cudaMemcpyDefault));
}
|
||||
};
|
||||
|
||||
class GPUHistMaker : public TreeUpdater {
|
||||
public:
|
||||
struct ExpandEntry;
|
||||
|
||||
GPUHistMaker() : initialised(false) {}
|
||||
GPUHistMaker() : initialised(false), p_last_fmat_(nullptr) {}
|
||||
~GPUHistMaker() {}
|
||||
void Init(
|
||||
const std::vector<std::pair<std::string, std::string>>& args) override {
|
||||
@ -571,6 +621,7 @@ class GPUHistMaker : public TreeUpdater {
|
||||
row_segments[cpu_thread_id + 1], n_bins, param));
|
||||
}
|
||||
|
||||
p_last_fmat_ = dmat;
|
||||
initialised = true;
|
||||
}
|
||||
|
||||
@ -858,6 +909,22 @@ class GPUHistMaker : public TreeUpdater {
|
||||
omp_set_num_threads(nthread);
|
||||
}
|
||||
|
||||
// Asks every device shard to update its slice of the prediction vector
// in place.
//
// data:        matrix the caller wants predictions for.
// p_out_preds: prediction vector; its device pointer for param.gpu_id is
//              handed to each shard.
//
// Returns false (and leaves p_out_preds untouched) when there are no shards,
// or when `data` is not the matrix recorded in p_last_fmat_; returns true
// after all shards have been updated.
bool UpdatePredictionCache
(const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) override {
  // Check preconditions before starting the timer, so that every
  // monitor.Start() is matched by a monitor.Stop() on all return paths.
  if (shards.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data) {
    return false;
  }

  monitor.Start("UpdatePredictionCache", dList);

  bst_float* out_preds_d = p_out_preds->ptr_d(param.gpu_id);

  // Hoist the count into a signed int: the OpenMP loop index stays signed
  // and the loop condition no longer mixes signed and unsigned operands.
  const int n_shards = static_cast<int>(shards.size());
#pragma omp parallel for schedule(static, 1)
  for (int shard = 0; shard < n_shards; ++shard) {
    shards[shard]->UpdatePredictionCache(out_preds_d);
  }

  monitor.Stop("UpdatePredictionCache", dList);
  return true;
}
|
||||
|
||||
struct ExpandEntry {
|
||||
int nid;
|
||||
int depth;
|
||||
@ -925,6 +992,8 @@ class GPUHistMaker : public TreeUpdater {
|
||||
dh::AllReducer reducer;
|
||||
std::vector<ValueConstraint> node_value_constraints_;
|
||||
std::vector<int> dList;
|
||||
|
||||
DMatrix* p_last_fmat_;
|
||||
};
|
||||
|
||||
XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user