diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index ca00e4b14..255888324 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -378,9 +378,11 @@ class GPUPredictor : public xgboost::Predictor {
     if (it != cache_.end()) {
       HostDeviceVector<bst_float>& y = it->second.predictions;
       if (y.size() != 0) {
+        dh::safe_cuda(cudaSetDevice(param.gpu_id));
         out_preds->resize(y.size(), 0.0f, param.gpu_id);
-        thrust::copy(y.tbegin(param.gpu_id), y.tend(param.gpu_id),
-                     out_preds->tbegin(param.gpu_id));
+        dh::safe_cuda(cudaMemcpy(
+            out_preds->ptr_d(param.gpu_id), y.ptr_d(param.gpu_id),
+            out_preds->size() * sizeof(bst_float), cudaMemcpyDefault));
         return true;
       }
     }
@@ -406,8 +408,7 @@ class GPUPredictor : public xgboost::Predictor {
                             static_cast<unsigned>(model.trees.size()));
       } else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
                  num_new_trees == 1 &&
-                 updaters->back()->UpdatePredictionCache(e.data.get(),
-                                                         &predictions)) {
+                 updaters->back()->UpdatePredictionCache(e.data.get(), &predictions)) {
         // do nothing
       } else {
         DevicePredictInternal(dmat, &predictions, model, old_ntree,
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 48bd45f09..b951b8a9e 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -210,6 +210,18 @@ struct DeviceHistogram {
   }
 };
 
+struct CalcWeightTrainParam {
+  float min_child_weight;
+  float reg_alpha;
+  float reg_lambda;
+  float max_delta_step;
+  float learning_rate;
+  __host__ __device__ CalcWeightTrainParam(const TrainParam& p)
+      : min_child_weight(p.min_child_weight), reg_alpha(p.reg_alpha),
+        reg_lambda(p.reg_lambda), max_delta_step(p.max_delta_step),
+        learning_rate(p.learning_rate) {}
+};
+
 // Manage memory for a single GPU
 struct DeviceShard {
   struct Segment {
@@ -236,7 +248,9 @@ struct DeviceShard {
   dh::dvec<bst_float> gidx_fvalue_map;
   dh::dvec<bst_float> min_fvalue;
   dh::dvec<int> monotone_constraints;
+  dh::dvec<bst_float> prediction_cache;
   std::vector<GradientPair> node_sum_gradients;
+  dh::dvec<GradientPair> node_sum_gradients_d;
   common::CompressedIterator<uint32_t> gidx;
   int row_stride;
   bst_uint row_begin_idx;  // The row offset for this shard
@@ -246,6 +260,7 @@ struct DeviceShard {
   int null_gidx_value;
   DeviceHistogram hist;
   TrainParam param;
+  bool prediction_cache_initialised;
 
   int64_t* tmp_pinned;  // Small amount of staging memory
 
@@ -263,7 +278,8 @@ struct DeviceShard {
         n_rows(row_end - row_begin),
         n_bins(n_bins),
         null_gidx_value(n_bins),
-        param(param) {
+        param(param),
+        prediction_cache_initialised(false) {
     // Convert to ELLPACK matrix representation
     int max_elements_row = 0;
     for (auto i = row_begin; i < row_end; i++) {
@@ -296,6 +312,7 @@ param.max_leaves > 0 ?
         param.max_leaves * 2 : n_nodes(param.max_depth);
     ba.allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes,
                 &gpair, n_rows, &ridx, n_rows, &position, n_rows,
+                &prediction_cache, n_rows, &node_sum_gradients_d, max_nodes,
                 &feature_segments, gmat.cut->row_ptr.size(), &gidx_fvalue_map,
                 gmat.cut->cut.size(), &min_fvalue, gmat.cut->min_val.size(),
                 &monotone_constraints, param.monotone_constraints.size());
@@ -481,13 +498,46 @@ struct DeviceShard {
         ridx.current() + segment.begin, ridx.other() + segment.begin,
         segment.Size() * sizeof(bst_uint), cudaMemcpyDeviceToDevice));
   }
+
+  void UpdatePredictionCache(bst_float* out_preds_d) {
+    dh::safe_cuda(cudaSetDevice(device_idx));
+    if (!prediction_cache_initialised) {
+      dh::safe_cuda(cudaMemcpy(
+          prediction_cache.data(), &out_preds_d[row_begin_idx],
+          prediction_cache.size() * sizeof(bst_float),
+          cudaMemcpyDefault));
+    }
+    prediction_cache_initialised = true;
+
+    CalcWeightTrainParam param_d(param);
+
+    thrust::copy(node_sum_gradients.begin(), node_sum_gradients.end(),
+                 node_sum_gradients_d.tbegin());
+    auto d_position = position.current();
+    auto d_ridx = ridx.current();
+    auto d_node_sum_gradients = node_sum_gradients_d.data();
+    auto d_prediction_cache = prediction_cache.data();
+
+    dh::launch_n(device_idx, prediction_cache.size(),
+                 [=] __device__(int local_idx) {
+      int pos = d_position[local_idx];
+      bst_float weight = CalcWeight(param_d, d_node_sum_gradients[pos]);
+      d_prediction_cache[d_ridx[local_idx]] +=
+          weight * param_d.learning_rate;
+    });
+
+    dh::safe_cuda(cudaMemcpy(
+        &out_preds_d[row_begin_idx], prediction_cache.data(),
+        prediction_cache.size() * sizeof(bst_float),
+        cudaMemcpyDefault));
+  }
 };
 
 class GPUHistMaker : public TreeUpdater {
  public:
   struct ExpandEntry;
-  GPUHistMaker() : initialised(false) {}
+  GPUHistMaker() : initialised(false), p_last_fmat_(nullptr) {}
   ~GPUHistMaker() {}
   void Init(
       const std::vector<std::pair<std::string, std::string>>& args) override {
@@ -571,6 +621,7 @@ class GPUHistMaker : public TreeUpdater {
                                             row_segments[cpu_thread_id + 1],
                                             n_bins, param));
     }
+    p_last_fmat_ = dmat;
     initialised = true;
   }
 
@@ -858,6 +909,22 @@ class GPUHistMaker : public TreeUpdater {
     omp_set_num_threads(nthread);
   }
 
+  bool UpdatePredictionCache(
+      const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) override {
+    monitor.Start("UpdatePredictionCache", dList);
+    if (shards.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
+      return false;
+
+    bst_float* out_preds_d = p_out_preds->ptr_d(param.gpu_id);
+
+#pragma omp parallel for schedule(static, 1)
+    for (int shard = 0; shard < shards.size(); ++shard) {
+      shards[shard]->UpdatePredictionCache(out_preds_d);
+    }
+    monitor.Stop("UpdatePredictionCache", dList);
+    return true;
+  }
+
   struct ExpandEntry {
     int nid;
     int depth;
@@ -925,6 +992,8 @@ class GPUHistMaker : public TreeUpdater {
   dh::AllReducer reducer;
   std::vector<ValueConstraint> node_value_constraints_;
   std::vector<int> dList;
+
+  DMatrix* p_last_fmat_;
 };
 
 XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
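
Note on the cache-update step: `DeviceShard::UpdatePredictionCache` avoids re-running full prediction after each boosting iteration by reusing state left over from training. Each row's final `position` indexes the per-node gradient sums, the leaf weight is recomputed with the usual XGBoost formula (roughly w = -G / (H + lambda); the real `CalcWeight` also handles `reg_alpha` and `max_delta_step`), and `learning_rate * w` is added to the cached prediction. The standalone CUDA sketch below illustrates just that step under simplified assumptions: `GradPair`, `CalcWeightSimple`, `UpdateCacheKernel`, and the toy data are illustrative stand-ins for the `GradientPair`/`CalcWeight`/`dh::launch_n` machinery the patch actually uses, and L1 regularisation is ignored.

```cuda
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Stand-in for xgboost's GradientPair: per-node sums of first- and
// second-order gradients.
struct GradPair {
  float grad;
  float hess;
};

// Stand-in for CalcWeight with only L2 regularisation:
// optimal leaf value w = -G / (H + lambda).
__device__ float CalcWeightSimple(float reg_lambda, GradPair sum) {
  return -sum.grad / (sum.hess + reg_lambda);
}

// One thread per row: look up the leaf the row ended in, recompute that
// leaf's weight, and add the learning-rate-scaled weight to the cache.
__global__ void UpdateCacheKernel(int n_rows, const int* position,
                                  const int* ridx, const GradPair* node_sums,
                                  float reg_lambda, float learning_rate,
                                  float* prediction_cache) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n_rows) return;
  float w = CalcWeightSimple(reg_lambda, node_sums[position[i]]);
  prediction_cache[ridx[i]] += learning_rate * w;
}

int main() {
  // Two rows ending in leaves 1 and 2 of a three-node tree; the row
  // index mapping is the identity and the cache starts at 0.5.
  std::vector<int> position = {1, 2};
  std::vector<int> ridx = {0, 1};
  std::vector<GradPair> node_sums = {{0.f, 0.f}, {-1.f, 1.f}, {2.f, 1.f}};
  std::vector<float> cache = {0.5f, 0.5f};

  int *d_pos, *d_ridx;
  GradPair* d_sums;
  float* d_cache;
  cudaMalloc(&d_pos, 2 * sizeof(int));
  cudaMalloc(&d_ridx, 2 * sizeof(int));
  cudaMalloc(&d_sums, 3 * sizeof(GradPair));
  cudaMalloc(&d_cache, 2 * sizeof(float));
  cudaMemcpy(d_pos, position.data(), 2 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_ridx, ridx.data(), 2 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_sums, node_sums.data(), 3 * sizeof(GradPair),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_cache, cache.data(), 2 * sizeof(float), cudaMemcpyHostToDevice);

  UpdateCacheKernel<<<1, 256>>>(2, d_pos, d_ridx, d_sums,
                                /*reg_lambda=*/1.0f, /*learning_rate=*/0.3f,
                                d_cache);
  cudaMemcpy(cache.data(), d_cache, 2 * sizeof(float), cudaMemcpyDeviceToHost);

  // Row 0: w = -(-1)/(1+1) = 0.5 -> 0.5 + 0.3 * 0.5 = 0.65
  // Row 1: w = -2/(1+1) = -1.0   -> 0.5 - 0.3       = 0.20
  printf("cache = {%.2f, %.2f}\n", cache[0], cache[1]);

  cudaFree(d_pos);
  cudaFree(d_ridx);
  cudaFree(d_sums);
  cudaFree(d_cache);
  return 0;
}
```

This is why the shard keeps `position` and `node_sum_gradients` around after the tree update: with those two arrays, refreshing the cache is a single O(n_rows) kernel per shard instead of a full traversal of the new tree for every cached row.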