Added back UpdatePredictionCache() in updater_gpu_hist.cu. (#3120)
* Added back UpdatePredictionCache() in updater_gpu_hist.cu. - it had been there before, but wasn't ported to the new version of updater_gpu_hist.cu
This commit is contained in:
parent
d5f1b74ef5
commit
a1b48afa41
@ -378,9 +378,11 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
if (it != cache_.end()) {
|
if (it != cache_.end()) {
|
||||||
HostDeviceVector<bst_float>& y = it->second.predictions;
|
HostDeviceVector<bst_float>& y = it->second.predictions;
|
||||||
if (y.size() != 0) {
|
if (y.size() != 0) {
|
||||||
|
dh::safe_cuda(cudaSetDevice(param.gpu_id));
|
||||||
out_preds->resize(y.size(), 0.0f, param.gpu_id);
|
out_preds->resize(y.size(), 0.0f, param.gpu_id);
|
||||||
thrust::copy(y.tbegin(param.gpu_id), y.tend(param.gpu_id),
|
dh::safe_cuda
|
||||||
out_preds->tbegin(param.gpu_id));
|
(cudaMemcpy(out_preds->ptr_d(param.gpu_id), y.ptr_d(param.gpu_id),
|
||||||
|
out_preds->size() * sizeof(bst_float), cudaMemcpyDefault));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -406,8 +408,7 @@ class GPUPredictor : public xgboost::Predictor {
|
|||||||
static_cast<bst_uint>(model.trees.size()));
|
static_cast<bst_uint>(model.trees.size()));
|
||||||
} else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
|
} else if (model.param.num_output_group == 1 && updaters->size() > 0 &&
|
||||||
num_new_trees == 1 &&
|
num_new_trees == 1 &&
|
||||||
updaters->back()->UpdatePredictionCache(e.data.get(),
|
updaters->back()->UpdatePredictionCache(e.data.get(), &predictions)) {
|
||||||
&predictions)) {
|
|
||||||
// do nothing
|
// do nothing
|
||||||
} else {
|
} else {
|
||||||
DevicePredictInternal(dmat, &predictions, model, old_ntree,
|
DevicePredictInternal(dmat, &predictions, model, old_ntree,
|
||||||
|
|||||||
@ -210,6 +210,18 @@ struct DeviceHistogram {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct CalcWeightTrainParam {
|
||||||
|
float min_child_weight;
|
||||||
|
float reg_alpha;
|
||||||
|
float reg_lambda;
|
||||||
|
float max_delta_step;
|
||||||
|
float learning_rate;
|
||||||
|
__host__ __device__ CalcWeightTrainParam(const TrainParam& p)
|
||||||
|
: min_child_weight(p.min_child_weight), reg_alpha(p.reg_alpha),
|
||||||
|
reg_lambda(p.reg_lambda), max_delta_step(p.max_delta_step),
|
||||||
|
learning_rate(p.learning_rate) {}
|
||||||
|
};
|
||||||
|
|
||||||
// Manage memory for a single GPU
|
// Manage memory for a single GPU
|
||||||
struct DeviceShard {
|
struct DeviceShard {
|
||||||
struct Segment {
|
struct Segment {
|
||||||
@ -236,7 +248,9 @@ struct DeviceShard {
|
|||||||
dh::dvec<float> gidx_fvalue_map;
|
dh::dvec<float> gidx_fvalue_map;
|
||||||
dh::dvec<float> min_fvalue;
|
dh::dvec<float> min_fvalue;
|
||||||
dh::dvec<int> monotone_constraints;
|
dh::dvec<int> monotone_constraints;
|
||||||
|
dh::dvec<bst_float> prediction_cache;
|
||||||
std::vector<bst_gpair> node_sum_gradients;
|
std::vector<bst_gpair> node_sum_gradients;
|
||||||
|
dh::dvec<bst_gpair> node_sum_gradients_d;
|
||||||
common::CompressedIterator<uint32_t> gidx;
|
common::CompressedIterator<uint32_t> gidx;
|
||||||
int row_stride;
|
int row_stride;
|
||||||
bst_uint row_begin_idx; // The row offset for this shard
|
bst_uint row_begin_idx; // The row offset for this shard
|
||||||
@ -246,6 +260,7 @@ struct DeviceShard {
|
|||||||
int null_gidx_value;
|
int null_gidx_value;
|
||||||
DeviceHistogram hist;
|
DeviceHistogram hist;
|
||||||
TrainParam param;
|
TrainParam param;
|
||||||
|
bool prediction_cache_initialised;
|
||||||
|
|
||||||
int64_t* tmp_pinned; // Small amount of staging memory
|
int64_t* tmp_pinned; // Small amount of staging memory
|
||||||
|
|
||||||
@ -263,7 +278,8 @@ struct DeviceShard {
|
|||||||
n_rows(row_end - row_begin),
|
n_rows(row_end - row_begin),
|
||||||
n_bins(n_bins),
|
n_bins(n_bins),
|
||||||
null_gidx_value(n_bins),
|
null_gidx_value(n_bins),
|
||||||
param(param) {
|
param(param),
|
||||||
|
prediction_cache_initialised(false) {
|
||||||
// Convert to ELLPACK matrix representation
|
// Convert to ELLPACK matrix representation
|
||||||
int max_elements_row = 0;
|
int max_elements_row = 0;
|
||||||
for (auto i = row_begin; i < row_end; i++) {
|
for (auto i = row_begin; i < row_end; i++) {
|
||||||
@ -296,6 +312,7 @@ struct DeviceShard {
|
|||||||
param.max_leaves > 0 ? param.max_leaves * 2 : n_nodes(param.max_depth);
|
param.max_leaves > 0 ? param.max_leaves * 2 : n_nodes(param.max_depth);
|
||||||
ba.allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes,
|
ba.allocate(device_idx, param.silent, &gidx_buffer, compressed_size_bytes,
|
||||||
&gpair, n_rows, &ridx, n_rows, &position, n_rows,
|
&gpair, n_rows, &ridx, n_rows, &position, n_rows,
|
||||||
|
&prediction_cache, n_rows, &node_sum_gradients_d, max_nodes,
|
||||||
&feature_segments, gmat.cut->row_ptr.size(), &gidx_fvalue_map,
|
&feature_segments, gmat.cut->row_ptr.size(), &gidx_fvalue_map,
|
||||||
gmat.cut->cut.size(), &min_fvalue, gmat.cut->min_val.size(),
|
gmat.cut->cut.size(), &min_fvalue, gmat.cut->min_val.size(),
|
||||||
&monotone_constraints, param.monotone_constraints.size());
|
&monotone_constraints, param.monotone_constraints.size());
|
||||||
@ -481,13 +498,46 @@ struct DeviceShard {
|
|||||||
ridx.current() + segment.begin, ridx.other() + segment.begin,
|
ridx.current() + segment.begin, ridx.other() + segment.begin,
|
||||||
segment.Size() * sizeof(bst_uint), cudaMemcpyDeviceToDevice));
|
segment.Size() * sizeof(bst_uint), cudaMemcpyDeviceToDevice));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void UpdatePredictionCache(bst_float* out_preds_d) {
|
||||||
|
dh::safe_cuda(cudaSetDevice(device_idx));
|
||||||
|
if (!prediction_cache_initialised) {
|
||||||
|
dh::safe_cuda(cudaMemcpy
|
||||||
|
(prediction_cache.data(), &out_preds_d[row_begin_idx],
|
||||||
|
prediction_cache.size() * sizeof(bst_float),
|
||||||
|
cudaMemcpyDefault));
|
||||||
|
}
|
||||||
|
prediction_cache_initialised = true;
|
||||||
|
|
||||||
|
CalcWeightTrainParam param_d(param);
|
||||||
|
|
||||||
|
thrust::copy(node_sum_gradients.begin(), node_sum_gradients.end(),
|
||||||
|
node_sum_gradients_d.tbegin());
|
||||||
|
auto d_position = position.current();
|
||||||
|
auto d_ridx = ridx.current();
|
||||||
|
auto d_node_sum_gradients = node_sum_gradients_d.data();
|
||||||
|
auto d_prediction_cache = prediction_cache.data();
|
||||||
|
|
||||||
|
dh::launch_n(device_idx, prediction_cache.size(),
|
||||||
|
[=] __device__(int local_idx) {
|
||||||
|
int pos = d_position[local_idx];
|
||||||
|
bst_float weight = CalcWeight(param_d, d_node_sum_gradients[pos]);
|
||||||
|
d_prediction_cache[d_ridx[local_idx]] +=
|
||||||
|
weight * param_d.learning_rate;
|
||||||
|
});
|
||||||
|
|
||||||
|
dh::safe_cuda(cudaMemcpy
|
||||||
|
(&out_preds_d[row_begin_idx], prediction_cache.data(),
|
||||||
|
prediction_cache.size() * sizeof(bst_float),
|
||||||
|
cudaMemcpyDefault));
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class GPUHistMaker : public TreeUpdater {
|
class GPUHistMaker : public TreeUpdater {
|
||||||
public:
|
public:
|
||||||
struct ExpandEntry;
|
struct ExpandEntry;
|
||||||
|
|
||||||
GPUHistMaker() : initialised(false) {}
|
GPUHistMaker() : initialised(false), p_last_fmat_(nullptr) {}
|
||||||
~GPUHistMaker() {}
|
~GPUHistMaker() {}
|
||||||
void Init(
|
void Init(
|
||||||
const std::vector<std::pair<std::string, std::string>>& args) override {
|
const std::vector<std::pair<std::string, std::string>>& args) override {
|
||||||
@ -571,6 +621,7 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
row_segments[cpu_thread_id + 1], n_bins, param));
|
row_segments[cpu_thread_id + 1], n_bins, param));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
p_last_fmat_ = dmat;
|
||||||
initialised = true;
|
initialised = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -858,6 +909,22 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
omp_set_num_threads(nthread);
|
omp_set_num_threads(nthread);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool UpdatePredictionCache
|
||||||
|
(const DMatrix* data, HostDeviceVector<bst_float>* p_out_preds) override {
|
||||||
|
monitor.Start("UpdatePredictionCache", dList);
|
||||||
|
if (shards.empty() || p_last_fmat_ == nullptr || p_last_fmat_ != data)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
bst_float *out_preds_d = p_out_preds->ptr_d(param.gpu_id);
|
||||||
|
|
||||||
|
#pragma omp parallel for schedule(static, 1)
|
||||||
|
for (int shard = 0; shard < shards.size(); ++shard) {
|
||||||
|
shards[shard]->UpdatePredictionCache(out_preds_d);
|
||||||
|
}
|
||||||
|
monitor.Stop("UpdatePredictionCache", dList);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
struct ExpandEntry {
|
struct ExpandEntry {
|
||||||
int nid;
|
int nid;
|
||||||
int depth;
|
int depth;
|
||||||
@ -925,6 +992,8 @@ class GPUHistMaker : public TreeUpdater {
|
|||||||
dh::AllReducer reducer;
|
dh::AllReducer reducer;
|
||||||
std::vector<ValueConstraint> node_value_constraints_;
|
std::vector<ValueConstraint> node_value_constraints_;
|
||||||
std::vector<int> dList;
|
std::vector<int> dList;
|
||||||
|
|
||||||
|
DMatrix* p_last_fmat_;
|
||||||
};
|
};
|
||||||
|
|
||||||
XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
|
XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user