Added back UpdatePredictionCache() in updater_gpu_hist.cu. (#3120)

* Added back UpdatePredictionCache() in updater_gpu_hist.cu.

- it had been there before, but wasn't ported to the new version
  of updater_gpu_hist.cu
Andrew V. Adinetz 2018-03-09 03:06:45 +01:00 committed by Rory Mitchell
parent d5f1b74ef5
commit a1b48afa41
2 changed files with 76 additions and 6 deletions
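
The change has two halves: the GPU predictor regains a fast path that reuses cached predictions, and GPUHistMaker regains the UpdatePredictionCache() hook that patches that cache after each boosting round. A minimal host-only sketch of the idea, with hypothetical simplified types (ToyUpdater and its members are illustrations, not xgboost classes):

#include <cstdio>
#include <vector>

struct ToyUpdater {
  // eta * leaf weight for each row, produced by the last boosting round.
  std::vector<float> last_round_deltas;

  // Returns true if the cached predictions could be patched in place.
  bool UpdatePredictionCache(std::vector<float>* preds) {
    if (preds->size() != last_round_deltas.size()) return false;
    for (size_t i = 0; i < preds->size(); ++i)
      (*preds)[i] += last_round_deltas[i];
    return true;
  }
};

int main() {
  ToyUpdater updater{{0.1f, -0.2f, 0.05f}};
  std::vector<float> cached_preds = {0.5f, 0.5f, 0.5f};
  int num_new_trees = 1, num_output_group = 1;
  // Mirrors the condition in the gpu_predictor.cu hunk below.
  if (num_output_group == 1 && num_new_trees == 1 &&
      updater.UpdatePredictionCache(&cached_preds)) {
    // do nothing: the cache already reflects the new tree
  } else {
    // fall back: re-run prediction over every tree (omitted)
  }
  for (float p : cached_preds) std::printf("%.2f\n", p);
  return 0;
}

With the fast path, finishing a round costs one pass over the rows instead of re-traversing every tree in the model.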

src/predictor/gpu_predictor.cu

@@ -378,9 +378,11 @@ class GPUPredictor : public xgboost::Predictor {
     if (it != cache_.end()) {
       HostDeviceVector<bst_float>& y = it->second.predictions;
       if (y.size() != 0) {
+        dh::safe_cuda(cudaSetDevice(param.gpu_id));
         out_preds->resize(y.size(), 0.0f, param.gpu_id);
-        thrust::copy(y.tbegin(param.gpu_id), y.tend(param.gpu_id),
-                     out_preds->tbegin(param.gpu_id));
+        dh::safe_cuda
+            (cudaMemcpy(out_preds->ptr_d(param.gpu_id), y.ptr_d(param.gpu_id),
+                        out_preds->size() * sizeof(bst_float), cudaMemcpyDefault));
         return true;
       }
     }
@@ -406,8 +408,7 @@ class GPUPredictor : public xgboost::Predictor {
           static_cast<bst_uint>(model.trees.size()));
     } else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
                num_new_trees == 1 &&
-               updaters->back()->UpdatePredictionCache(e.data.get(),
-                                                       &predictions)) {
+               updaters->back()->UpdatePredictionCache(e.data.get(), &predictions)) {
       // do nothing
     } else {
       DevicePredictInternal(dmat, &predictions, model, old_ntree,
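
Two details in the first hunk above are easy to miss: cudaSetDevice() is now called before touching device memory, and the thrust::copy is replaced by a single cudaMemcpy with cudaMemcpyDefault, which lets the runtime infer the transfer direction from the pointers (unified virtual addressing). A standalone illustration of that idiom, assuming nothing from xgboost (SAFE_CUDA is a local helper, not dh::safe_cuda):

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

#define SAFE_CUDA(call)                                        \
  do {                                                         \
    cudaError_t e = (call);                                    \
    if (e != cudaSuccess) {                                    \
      std::printf("CUDA error: %s\n", cudaGetErrorString(e));  \
      return 1;                                                \
    }                                                          \
  } while (0)

int main() {
  const size_t n = 4;
  std::vector<float> host = {1.f, 2.f, 3.f, 4.f};
  float *src = nullptr, *dst = nullptr;
  SAFE_CUDA(cudaSetDevice(0));  // bind the right device context first
  SAFE_CUDA(cudaMalloc(&src, n * sizeof(float)));
  SAFE_CUDA(cudaMalloc(&dst, n * sizeof(float)));
  // Direction is inferred in every call below: H2D, then D2D, then D2H.
  SAFE_CUDA(cudaMemcpy(src, host.data(), n * sizeof(float), cudaMemcpyDefault));
  SAFE_CUDA(cudaMemcpy(dst, src, n * sizeof(float), cudaMemcpyDefault));
  SAFE_CUDA(cudaMemcpy(host.data(), dst, n * sizeof(float), cudaMemcpyDefault));
  std::printf("%.0f %.0f %.0f %.0f\n", host[0], host[1], host[2], host[3]);
  SAFE_CUDA(cudaFree(src));
  SAFE_CUDA(cudaFree(dst));
  return 0;
}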

src/tree/updater_gpu_hist.cu

@@ -210,6 +210,18 @@ struct DeviceHistogram {
   }
 };
 
+struct CalcWeightTrainParam {
+  float min_child_weight;
+  float reg_alpha;
+  float reg_lambda;
+  float max_delta_step;
+  float learning_rate;
+  __host__ __device__ CalcWeightTrainParam(const TrainParam& p)
+      : min_child_weight(p.min_child_weight), reg_alpha(p.reg_alpha),
+        reg_lambda(p.reg_lambda), max_delta_step(p.max_delta_step),
+        learning_rate(p.learning_rate) {}
+};
+
 // Manage memory for a single GPU
 struct DeviceShard {
   struct Segment {
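
CalcWeightTrainParam is a trivially copyable snapshot of the handful of TrainParam fields that the leaf-weight computation needs, so a [=] __device__ lambda can capture it by value and ship it to the GPU. A hedged sketch of the pattern with stand-in types (HostParam, DeviceParam, and LaunchN are illustrations, not xgboost's TrainParam, CalcWeightTrainParam, or dh::launch_n; compile with nvcc --expt-extended-lambda):

#include <cuda_runtime.h>

struct HostParam {  // stand-in for TrainParam
  float reg_lambda = 1.0f;
  float learning_rate = 0.3f;
  // ... plus host-only members (strings, vectors) that cannot go to device
};

struct DeviceParam {  // stand-in for CalcWeightTrainParam
  float reg_lambda;
  float learning_rate;
  __host__ __device__ explicit DeviceParam(const HostParam& p)
      : reg_lambda(p.reg_lambda), learning_rate(p.learning_rate) {}
};

template <typename Func>
__global__ void LaunchN(int n, Func f) {  // stand-in for dh::launch_n
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) f(i);
}

int main() {
  HostParam host_param;
  DeviceParam param_d(host_param);  // flatten to a device-friendly POD
  float* out = nullptr;
  cudaMalloc(&out, 4 * sizeof(float));  // error checking omitted in sketch
  LaunchN<<<1, 4>>>(4, [=] __device__(int i) {
    out[i] = param_d.learning_rate * i;  // param_d was captured by value
  });
  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}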
@@ -236,7 +248,9 @@ struct DeviceShard {
   dh::dvec<float> gidx_fvalue_map;
   dh::dvec<float> min_fvalue;
   dh::dvec<int> monotone_constraints;
+  dh::dvec<bst_float> prediction_cache;
   std::vector<bst_gpair> node_sum_gradients;
+  dh::dvec<bst_gpair> node_sum_gradients_d;
   common::CompressedIterator<uint32_t> gidx;
   int row_stride;
   bst_uint row_begin_idx;  // The row offset for this shard
@@ -246,6 +260,7 @@ struct DeviceShard {
   int null_gidx_value;
   DeviceHistogram hist;
   TrainParam param;
+  bool prediction_cache_initialised;
 
   int64_t* tmp_pinned;  // Small amount of staging memory
@@ -263,7 +278,8 @@ struct DeviceShard {
         n_rows(row_end - row_begin),
         n_bins(n_bins),
         null_gidx_value(n_bins),
-        param(param) {
+        param(param),
+        prediction_cache_initialised(false) {
     // Convert to ELLPACK matrix representation
     int max_elements_row = 0;
     for (auto i = row_begin; i < row_end; i++) {
@@ -296,6 +312,7 @@ struct DeviceShard {
         param.max_leaves > 0 ? param.max_leaves * 2 : n_nodes(param.max_depth);
     ba.allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes,
                 &gpair, n_rows, &ridx, n_rows, &position, n_rows,
+                &prediction_cache, n_rows, &node_sum_gradients_d, max_nodes,
                 &feature_segments, gmat.cut->row_ptr.size(), &gidx_fvalue_map,
                 gmat.cut->cut.size(), &min_fvalue, gmat.cut->min_val.size(),
                 &monotone_constraints, param.monotone_constraints.size());
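
prediction_cache (one float per row) and node_sum_gradients_d (one gradient pair per tree node) join the shard's single bulk allocation: every buffer is sized up front and carved out of one device allocation. A hedged sketch of that bump-allocator pattern (BumpAllocator is a stand-in, not xgboost's dh::bulk_allocator, and error checking is omitted):

#include <cuda_runtime.h>
#include <cstddef>

struct BumpAllocator {  // stand-in for dh::bulk_allocator
  char* base = nullptr;
  size_t offset = 0;
  explicit BumpAllocator(size_t total_bytes) {
    cudaMalloc(&base, total_bytes);  // one allocation for every buffer
  }
  ~BumpAllocator() { cudaFree(base); }
  template <typename T>
  T* Carve(size_t n) {
    // Round up to T's alignment, then hand out the next n * sizeof(T) bytes.
    offset = (offset + alignof(T) - 1) / alignof(T) * alignof(T);
    T* p = reinterpret_cast<T*>(base + offset);
    offset += n * sizeof(T);
    return p;
  }
};

int main() {
  const size_t n_rows = 1000, max_nodes = 512;
  BumpAllocator ba(n_rows * sizeof(float) + max_nodes * sizeof(float2) + 64);
  float* prediction_cache = ba.Carve<float>(n_rows);           // one per row
  float2* node_sum_gradients_d = ba.Carve<float2>(max_nodes);  // one per node
  (void)prediction_cache;
  (void)node_sum_gradients_d;
  return 0;
}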
@@ -481,13 +498,46 @@ struct DeviceShard {
                    ridx.current() + segment.begin, ridx.other() + segment.begin,
                    segment.Size() * sizeof(bst_uint), cudaMemcpyDeviceToDevice));
   }
+
+  void UpdatePredictionCache(bst_float* out_preds_d) {
+    dh::safe_cuda(cudaSetDevice(device_idx));
+    if (!prediction_cache_initialised) {
+      dh::safe_cuda(cudaMemcpy
+                    (prediction_cache.data(), &out_preds_d[row_begin_idx],
+                     prediction_cache.size() * sizeof(bst_float),
+                     cudaMemcpyDefault));
+    }
+    prediction_cache_initialised = true;
+
+    CalcWeightTrainParam param_d(param);
+    thrust::copy(node_sum_gradients.begin(), node_sum_gradients.end(),
+                 node_sum_gradients_d.tbegin());
+    auto d_position = position.current();
+    auto d_ridx = ridx.current();
+    auto d_node_sum_gradients = node_sum_gradients_d.data();
+    auto d_prediction_cache = prediction_cache.data();
+
+    dh::launch_n(device_idx, prediction_cache.size(),
+                 [=] __device__(int local_idx) {
+                   int pos = d_position[local_idx];
+                   bst_float weight =
+                       CalcWeight(param_d, d_node_sum_gradients[pos]);
+                   d_prediction_cache[d_ridx[local_idx]] +=
+                       weight * param_d.learning_rate;
+                 });
+
+    dh::safe_cuda(cudaMemcpy
+                  (&out_preds_d[row_begin_idx], prediction_cache.data(),
+                   prediction_cache.size() * sizeof(bst_float),
+                   cudaMemcpyDefault));
+  }
 };
 
 class GPUHistMaker : public TreeUpdater {
  public:
   struct ExpandEntry;
-  GPUHistMaker() : initialised(false) {}
+  GPUHistMaker() : initialised(false), p_last_fmat_(nullptr) {}
   ~GPUHistMaker() {}
 
   void Init(
       const std::vector<std::pair<std::string, std::string>>& args) override {
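
The restored DeviceShard::UpdatePredictionCache() copies this shard's slice of the global cache in on first use, then adds learning_rate * CalcWeight(node_sum) to each row's cached prediction, where the node is the leaf the row ended up in, and finally copies the slice back. Ignoring reg_alpha, max_delta_step, and monotone constraints, CalcWeight reduces to the usual leaf weight w = -G / (H + lambda). A host-only sketch of the per-row arithmetic (all data below is made up for illustration, and GradPair stands in for bst_gpair):

#include <cstdio>
#include <vector>

struct GradPair {  // summed gradient / hessian for one node
  float grad;
  float hess;
};

// Simplified leaf weight: w = -G / (H + lambda). The real CalcWeight also
// handles reg_alpha, max_delta_step, and monotone constraints.
float CalcWeightSimple(float reg_lambda, GradPair sum) {
  return -sum.grad / (sum.hess + reg_lambda);
}

int main() {
  const float learning_rate = 0.3f, reg_lambda = 1.0f;
  std::vector<float> prediction_cache = {0.5f, 0.5f, 0.5f, 0.5f};
  std::vector<int> position = {2, 2, 3, 3};  // leaf id each row landed in
  std::vector<int> ridx = {0, 1, 2, 3};      // shard-local -> cache index
  std::vector<GradPair> node_sum_gradients = {
      {0.0f, 0.0f}, {0.0f, 0.0f}, {1.2f, 3.0f}, {-0.8f, 1.0f}};
  // Same update as the device lambda above, one row at a time on the host.
  for (size_t i = 0; i < ridx.size(); ++i) {
    float weight =
        CalcWeightSimple(reg_lambda, node_sum_gradients[position[i]]);
    prediction_cache[ridx[i]] += weight * learning_rate;
  }
  for (float p : prediction_cache) std::printf("%.4f\n", p);
  return 0;
}

Because the kernel indexes the cache through d_ridx, the update stays correct even after rows have been reordered by partitioning during tree construction.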
@@ -571,6 +621,7 @@ class GPUHistMaker : public TreeUpdater {
           row_segments[cpu_thread_id + 1], n_bins, param));
     }
 
+    p_last_fmat_ = dmat;
     initialised = true;
   }
@@ -858,6 +909,22 @@ class GPUHistMaker : public TreeUpdater {
     omp_set_num_threads(nthread);
   }
 
+  bool UpdatePredictionCache
+      (const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) override {
+    monitor.Start("UpdatePredictionCache", dList);
+    if (shards.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
+      return false;
+
+    bst_float *out_preds_d = p_out_preds->ptr_d(param.gpu_id);
+#pragma omp parallel for schedule(static, 1)
+    for (int shard = 0; shard < shards.size(); ++shard) {
+      shards[shard]->UpdatePredictionCache(out_preds_d);
+    }
+    monitor.Stop("UpdatePredictionCache", dList);
+    return true;
+  }
+
   struct ExpandEntry {
     int nid;
     int depth;
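
At the updater level, UpdatePredictionCache() first checks that the request is for the same DMatrix the updater was last initialised with (the p_last_fmat_ recorded above), then fans the work out with one OpenMP thread per shard; schedule(static, 1) hands exactly one shard to each thread, so every GPU runs its kernel concurrently. A minimal sketch of that dispatch pattern (Shard is a stand-in for DeviceShard; build with -fopenmp or the nvcc equivalent):

#include <omp.h>
#include <cstdio>
#include <vector>

struct Shard {  // stand-in for DeviceShard
  int device_idx;
  void UpdatePredictionCache(float* out_preds_d) {
    // Real code would cudaSetDevice(device_idx), launch the kernel, then
    // copy this shard's slice of out_preds_d back; here we show the fan-out.
    std::printf("shard %d handled by thread %d\n", device_idx,
                omp_get_thread_num());
  }
};

int main() {
  std::vector<Shard> shards = {{0}, {1}};
  std::vector<float> out_preds(8, 0.0f);
  // schedule(static, 1): thread i takes shard i, one shard per thread.
#pragma omp parallel for schedule(static, 1)
  for (int i = 0; i < static_cast<int>(shards.size()); ++i) {
    shards[i].UpdatePredictionCache(out_preds.data());
  }
  return 0;
}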
@@ -925,6 +992,8 @@ class GPUHistMaker : public TreeUpdater {
   dh::AllReducer reducer;
   std::vector<ValueConstraint> node_value_constraints_;
   std::vector<int> dList;
+
+  DMatrix* p_last_fmat_;
 };
 
 XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")