Retire DVec class in favour of c++20 style span for device memory. (#4293)
This commit is contained in:
@@ -545,21 +545,21 @@ class GPUMaker : public TreeUpdater {
|
||||
/** whether we have initialized memory already (so as not to repeat!) */
|
||||
bool allocated_;
|
||||
/** feature values stored in column-major compressed format */
|
||||
dh::DVec2<float> vals_;
|
||||
dh::DVec<float> vals_cached_;
|
||||
dh::DoubleBuffer<float> vals_;
|
||||
common::Span<float> vals_cached_;
|
||||
/** corresponding instance ids of these feature values */
|
||||
dh::DVec2<int> instIds_;
|
||||
dh::DVec<int> inst_ids_cached_;
|
||||
dh::DoubleBuffer<int> instIds_;
|
||||
common::Span<int> inst_ids_cached_;
|
||||
/** column offsets for these feature values */
|
||||
dh::DVec<int> colOffsets_;
|
||||
dh::DVec<GradientPair> gradsInst_;
|
||||
dh::DVec2<NodeIdT> nodeAssigns_;
|
||||
dh::DVec2<int> nodeLocations_;
|
||||
dh::DVec<DeviceNodeStats> nodes_;
|
||||
dh::DVec<NodeIdT> node_assigns_per_inst_;
|
||||
dh::DVec<GradientPair> gradsums_;
|
||||
dh::DVec<GradientPair> gradscans_;
|
||||
dh::DVec<ExactSplitCandidate> nodeSplits_;
|
||||
common::Span<int> colOffsets_;
|
||||
common::Span<GradientPair> gradsInst_;
|
||||
dh::DoubleBuffer<NodeIdT> nodeAssigns_;
|
||||
dh::DoubleBuffer<int> nodeLocations_;
|
||||
common::Span<DeviceNodeStats> nodes_;
|
||||
common::Span<NodeIdT> node_assigns_per_inst_;
|
||||
common::Span<GradientPair> gradsums_;
|
||||
common::Span<GradientPair> gradscans_;
|
||||
common::Span<ExactSplitCandidate> nodeSplits_;
|
||||
int n_vals_;
|
||||
int n_rows_;
|
||||
int n_cols_;
|
||||
@@ -571,10 +571,10 @@ class GPUMaker : public TreeUpdater {
|
||||
GPUSet devices_;
|
||||
|
||||
dh::CubMemory tmp_mem_;
|
||||
dh::DVec<GradientPair> tmpScanGradBuff_;
|
||||
dh::DVec<int> tmp_scan_key_buff_;
|
||||
dh::DVec<int> colIds_;
|
||||
dh::BulkAllocator<dh::MemoryType::kDevice> ba_;
|
||||
common::Span<GradientPair> tmpScanGradBuff_;
|
||||
common::Span<int> tmp_scan_key_buff_;
|
||||
common::Span<int> colIds_;
|
||||
dh::BulkAllocator ba_;
|
||||
|
||||
public:
|
||||
GPUMaker() : allocated_{false} {}
|
||||
@@ -615,8 +615,8 @@ class GPUMaker : public TreeUpdater {
|
||||
for (int i = 0; i < param_.max_depth; ++i) {
|
||||
if (i == 0) {
|
||||
// make sure to start on a fresh tree with sorted values!
|
||||
vals_.CurrentDVec() = vals_cached_;
|
||||
instIds_.CurrentDVec() = inst_ids_cached_;
|
||||
dh::CopyDeviceSpan(vals_.CurrentSpan(), vals_cached_);
|
||||
dh::CopyDeviceSpan(instIds_.CurrentSpan(), inst_ids_cached_);
|
||||
TransferGrads(gpair);
|
||||
}
|
||||
int nNodes = 1 << i;
|
||||
@@ -630,13 +630,13 @@ class GPUMaker : public TreeUpdater {
|
||||
}
|
||||
|
||||
void Split2Node(int nNodes, NodeIdT nodeStart) {
|
||||
auto d_nodes = nodes_.GetSpan();
|
||||
auto d_gradScans = gradscans_.GetSpan();
|
||||
auto d_gradsums = gradsums_.GetSpan();
|
||||
auto d_nodes = nodes_;
|
||||
auto d_gradScans = gradscans_;
|
||||
auto d_gradsums = gradsums_;
|
||||
auto d_nodeAssigns = nodeAssigns_.CurrentSpan();
|
||||
auto d_colIds = colIds_.GetSpan();
|
||||
auto d_colIds = colIds_;
|
||||
auto d_vals = vals_.Current();
|
||||
auto d_nodeSplits = nodeSplits_.Data();
|
||||
auto d_nodeSplits = nodeSplits_.data();
|
||||
int nUniqKeys = nNodes;
|
||||
float min_split_loss = param_.min_split_loss;
|
||||
auto gpu_param = GPUTrainingParam(param_);
|
||||
@@ -679,13 +679,13 @@ class GPUMaker : public TreeUpdater {
|
||||
}
|
||||
|
||||
// Runs the split search for the given `level` by invoking, in order,
// ReduceScanByKey, ArgMaxByKey and Split2Node on this updater's buffers.
//   level     - compared against kMaxAbkLevels to pick kAbkSmem or kAbkGmem
//   nodeStart - forwarded unchanged to all three calls
//   nNodes    - forwarded unchanged to all three calls
// NOTE(review): this text is a rendered diff, so each of the first two calls
// appears twice; the lines using `.GetSpan()` look like the removed variant
// and the lines passing the members directly like their replacement —
// confirm against the repository.
void FindSplit(int level, NodeIdT nodeStart, int nNodes) {
|
||||
ReduceScanByKey(gradsums_.GetSpan(), gradscans_.GetSpan(), gradsInst_.GetSpan(),
|
||||
ReduceScanByKey(gradsums_, gradscans_, gradsInst_,
|
||||
instIds_.CurrentSpan(), nodeAssigns_.CurrentSpan(), n_vals_, nNodes,
|
||||
n_cols_, tmpScanGradBuff_.GetSpan(), tmp_scan_key_buff_.GetSpan(),
|
||||
colIds_.GetSpan(), nodeStart);
|
||||
ArgMaxByKey(nodeSplits_.GetSpan(), gradscans_.GetSpan(), gradsums_.GetSpan(),
|
||||
vals_.CurrentSpan(), colIds_.GetSpan(), nodeAssigns_.CurrentSpan(),
|
||||
nodes_.GetSpan(), nNodes, nodeStart, n_vals_, param_,
|
||||
n_cols_, tmpScanGradBuff_, tmp_scan_key_buff_,
|
||||
colIds_, nodeStart);
|
||||
ArgMaxByKey(nodeSplits_, gradscans_, gradsums_,
|
||||
vals_.CurrentSpan(), colIds_, nodeAssigns_.CurrentSpan(),
|
||||
nodes_, nNodes, nodeStart, n_vals_, param_,
|
||||
// last ArgMaxByKey argument: kAbkSmem while level <= kMaxAbkLevels, else kAbkGmem
|
||||
level <= kMaxAbkLevels ? kAbkSmem : kAbkGmem);
|
||||
// hand the same node range to Split2Node once the two calls above were issued
|
||||
Split2Node(nNodes, nodeStart);
|
||||
}
|
||||
@@ -707,7 +707,7 @@ class GPUMaker : public TreeUpdater {
|
||||
}
|
||||
std::vector<float> fval;
|
||||
std::vector<int> fId;
|
||||
std::vector<size_t> offset;
|
||||
std::vector<int> offset;
|
||||
ConvertToCsc(dmat, &fval, &fId, &offset);
|
||||
AllocateAllData(static_cast<int>(offset.size()));
|
||||
TransferAndSortData(fval, fId, offset);
|
||||
@@ -715,7 +715,7 @@ class GPUMaker : public TreeUpdater {
|
||||
}
|
||||
|
||||
void ConvertToCsc(DMatrix* dmat, std::vector<float>* fval,
|
||||
std::vector<int>* fId, std::vector<size_t>* offset) {
|
||||
std::vector<int>* fId, std::vector<int>* offset) {
|
||||
const MetaInfo& info = dmat->Info();
|
||||
CHECK(info.num_col_ < std::numeric_limits<int>::max());
|
||||
CHECK(info.num_row_ < std::numeric_limits<int>::max());
|
||||
@@ -735,7 +735,7 @@ class GPUMaker : public TreeUpdater {
|
||||
fval->push_back(e.fvalue);
|
||||
fId->push_back(inst_id);
|
||||
}
|
||||
offset->push_back(fval->size());
|
||||
offset->push_back(static_cast<int>(fval->size()));
|
||||
}
|
||||
}
|
||||
CHECK(fval->size() < std::numeric_limits<int>::max());
|
||||
@@ -744,19 +744,21 @@ class GPUMaker : public TreeUpdater {
|
||||
|
||||
// Uploads the three host vectors into this updater's buffers, sorts them and
// caches the sorted result:
//   1. fval -> current buffer of vals_, fId -> current buffer of instIds_,
//      offset -> colOffsets_
//   2. dh::SegmentedSort<float, int> over vals_ / instIds_, given n_vals_,
//      n_cols_ and colOffsets_
//   3. current vals_ / instIds_ buffers -> vals_cached_ / inst_ids_cached_
//   4. AssignColIds launched with n_cols_ blocks of 512 threads on
//      colIds_ and colOffsets_
// NOTE(review): this text is a rendered diff; the `std::vector<size_t>`
// signature line, the `CurrentDVec()` assignments and the `.Data()` launch
// look like the removed variant, the `std::vector<int>` signature line,
// the dh::Copy* calls and the `.data()` launch like the replacement —
// confirm against the repository.
void TransferAndSortData(const std::vector<float>& fval,
|
||||
const std::vector<int>& fId,
|
||||
const std::vector<size_t>& offset) {
|
||||
vals_.CurrentDVec() = fval;
|
||||
instIds_.CurrentDVec() = fId;
|
||||
colOffsets_ = offset;
|
||||
const std::vector<int>& offset) {
|
||||
dh::CopyVectorToDeviceSpan(vals_.CurrentSpan(), fval);
|
||||
dh::CopyVectorToDeviceSpan(instIds_.CurrentSpan(), fId);
|
||||
dh::CopyVectorToDeviceSpan(colOffsets_, offset);
|
||||
dh::SegmentedSort<float, int>(&tmp_mem_, &vals_, &instIds_, n_vals_, n_cols_,
|
||||
colOffsets_);
|
||||
vals_cached_ = vals_.CurrentDVec();
|
||||
inst_ids_cached_ = instIds_.CurrentDVec();
|
||||
AssignColIds<<<n_cols_, 512>>>(colIds_.Data(), colOffsets_.Data());
|
||||
// keep a copy of the sorted buffers in vals_cached_ / inst_ids_cached_
|
||||
dh::CopyDeviceSpan(vals_cached_, vals_.CurrentSpan());
|
||||
dh::CopyDeviceSpan(inst_ids_cached_, instIds_.CurrentSpan());
|
||||
AssignColIds<<<n_cols_, 512>>>(colIds_.data(), colOffsets_.data());
|
||||
}
|
||||
|
||||
// Gathers the gradient pairs held by `gpair` into gradsInst_ via
// HostDeviceVector::GatherTo, then calls dh::SumReduction with gradsInst_,
// gradsums_ and n_rows_ (using tmp_mem_ as its first argument).
// NOTE(review): this text is a rendered diff; the `tbegin()/tend()` call
// looks like the removed variant and the thrust::device_pointer_cast call
// over [data(), data() + size()) like its replacement — confirm upstream.
void TransferGrads(HostDeviceVector<GradientPair>* gpair) {
|
||||
gpair->GatherTo(gradsInst_.tbegin(), gradsInst_.tend());
|
||||
gpair->GatherTo(
|
||||
thrust::device_pointer_cast(gradsInst_.data()),
|
||||
thrust::device_pointer_cast(gradsInst_.data() + gradsInst_.size()));
|
||||
// evaluate the full-grad reduction for the root node
|
||||
dh::SumReduction<GradientPair>(tmp_mem_, gradsInst_, gradsums_, n_rows_);
|
||||
}
|
||||
@@ -764,14 +766,22 @@ class GPUMaker : public TreeUpdater {
|
||||
void InitNodeData(int level, NodeIdT nodeStart, int nNodes) {
|
||||
// all instances belong to root node at the beginning!
|
||||
if (level == 0) {
|
||||
nodes_.Fill(DeviceNodeStats());
|
||||
nodeAssigns_.CurrentDVec().Fill(0);
|
||||
node_assigns_per_inst_.Fill(0);
|
||||
thrust::fill(thrust::device_pointer_cast(nodes_.data()),
|
||||
thrust::device_pointer_cast(nodes_.data() + nodes_.size()),
|
||||
DeviceNodeStats());
|
||||
thrust::fill(thrust::device_pointer_cast(nodeAssigns_.Current()),
|
||||
thrust::device_pointer_cast(nodeAssigns_.Current() +
|
||||
nodeAssigns_.Size()),
|
||||
0);
|
||||
thrust::fill(thrust::device_pointer_cast(node_assigns_per_inst_.data()),
|
||||
thrust::device_pointer_cast(node_assigns_per_inst_.data() +
|
||||
node_assigns_per_inst_.size()),
|
||||
0);
|
||||
// for root node, just update the gradient/score/weight/id info
|
||||
// before splitting it! Currently all data is on GPU, hence this
|
||||
// stupid little kernel
|
||||
auto d_nodes = nodes_.Data();
|
||||
auto d_sums = gradsums_.Data();
|
||||
auto d_nodes = nodes_;
|
||||
auto d_sums = gradsums_;
|
||||
auto gpu_params = GPUTrainingParam(param_);
|
||||
dh::LaunchN(param_.gpu_id, 1, [=] __device__(int idx) {
|
||||
d_nodes[0] = DeviceNodeStats(d_sums[0], 0, gpu_params);
|
||||
@@ -781,17 +791,17 @@ class GPUMaker : public TreeUpdater {
|
||||
const int ItemsPerThread = 4;
|
||||
// assign default node ids first
|
||||
int nBlks = dh::DivRoundUp(n_rows_, BlkDim);
|
||||
FillDefaultNodeIds<<<nBlks, BlkDim>>>(node_assigns_per_inst_.Data(),
|
||||
nodes_.Data(), n_rows_);
|
||||
FillDefaultNodeIds<<<nBlks, BlkDim>>>(node_assigns_per_inst_.data(),
|
||||
nodes_.data(), n_rows_);
|
||||
// evaluate the correct child indices of non-missing values next
|
||||
nBlks = dh::DivRoundUp(n_vals_, BlkDim * ItemsPerThread);
|
||||
AssignNodeIds<<<nBlks, BlkDim>>>(
|
||||
node_assigns_per_inst_.Data(), nodeLocations_.Current(),
|
||||
nodeAssigns_.Current(), instIds_.Current(), nodes_.Data(),
|
||||
colOffsets_.Data(), vals_.Current(), n_vals_, n_cols_);
|
||||
node_assigns_per_inst_.data(), nodeLocations_.Current(),
|
||||
nodeAssigns_.Current(), instIds_.Current(), nodes_.data(),
|
||||
colOffsets_.data(), vals_.Current(), n_vals_, n_cols_);
|
||||
// gather the node assignments across all other columns too
|
||||
dh::Gather(param_.gpu_id, nodeAssigns_.Current(),
|
||||
node_assigns_per_inst_.Data(), instIds_.Current(), n_vals_);
|
||||
node_assigns_per_inst_.data(), instIds_.Current(), n_vals_);
|
||||
SortKeys(level);
|
||||
}
|
||||
}
|
||||
@@ -804,14 +814,14 @@ class GPUMaker : public TreeUpdater {
|
||||
dh::Gather<float, int>(param_.gpu_id, vals_.other(),
|
||||
vals_.Current(), instIds_.other(), instIds_.Current(),
|
||||
nodeLocations_.Current(), n_vals_);
|
||||
vals_.buff().selector ^= 1;
|
||||
instIds_.buff().selector ^= 1;
|
||||
vals_.buff.selector ^= 1;
|
||||
instIds_.buff.selector ^= 1;
|
||||
}
|
||||
|
||||
// Launches MarkLeavesKernel on nodes_ with maxNodes_ as the element count,
// using 128 threads per block and DivRoundUp(maxNodes_, 128) blocks.
// NOTE(review): this text is a rendered diff; the `.Data()` launch looks
// like the removed variant and the `.data()` launch like its replacement —
// confirm against the repository.
void MarkLeaves() {
|
||||
const int BlkDim = 128;
|
||||
// enough blocks to give each of the maxNodes_ entries its own thread
|
||||
int nBlks = dh::DivRoundUp(maxNodes_, BlkDim);
|
||||
MarkLeavesKernel<<<nBlks, BlkDim>>>(nodes_.Data(), maxNodes_);
|
||||
MarkLeavesKernel<<<nBlks, BlkDim>>>(nodes_.data(), maxNodes_);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -254,10 +254,13 @@ XGBOOST_DEVICE inline bool IsLeftChild(int nidx) {
|
||||
|
||||
// Copy gpu dense representation of tree to xgboost sparse representation
|
||||
inline void Dense2SparseTree(RegTree* p_tree,
|
||||
const dh::DVec<DeviceNodeStats>& nodes,
|
||||
common::Span<DeviceNodeStats> nodes,
|
||||
const TrainParam& param) {
|
||||
RegTree& tree = *p_tree;
|
||||
std::vector<DeviceNodeStats> h_nodes = nodes.AsVector();
|
||||
std::vector<DeviceNodeStats> h_nodes(nodes.size());
|
||||
dh::safe_cuda(cudaMemcpy(h_nodes.data(), nodes.data(),
|
||||
nodes.size() * sizeof(DeviceNodeStats),
|
||||
cudaMemcpyDeviceToHost));
|
||||
|
||||
int nid = 0;
|
||||
for (int gpu_nid = 0; gpu_nid < h_nodes.size(); gpu_nid++) {
|
||||
@@ -298,18 +301,16 @@ struct BernoulliRng {
|
||||
};
|
||||
|
||||
// Set gradient pair to 0 with p = 1 - subsample
|
||||
inline void SubsampleGradientPair(dh::DVec<GradientPair>* p_gpair, float subsample,
|
||||
int offset = 0) {
|
||||
inline void SubsampleGradientPair(int device_idx,
|
||||
common::Span<GradientPair> d_gpair,
|
||||
float subsample, int offset = 0) {
|
||||
if (subsample == 1.0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dh::DVec<GradientPair>& gpair = *p_gpair;
|
||||
|
||||
auto d_gpair = gpair.Data();
|
||||
BernoulliRng rng(subsample, common::GlobalRandom()());
|
||||
|
||||
dh::LaunchN(gpair.DeviceIdx(), gpair.Size(), [=] XGBOOST_DEVICE(int i) {
|
||||
dh::LaunchN(device_idx, d_gpair.size(), [=] XGBOOST_DEVICE(int i) {
|
||||
if (!rng(i + offset)) {
|
||||
d_gpair[i] = GradientPair();
|
||||
}
|
||||
|
||||
@@ -601,7 +601,7 @@ struct DeviceShard {
|
||||
int n_bins;
|
||||
int device_id;
|
||||
|
||||
dh::BulkAllocator<dh::MemoryType::kDevice> ba;
|
||||
dh::BulkAllocator ba;
|
||||
|
||||
ELLPackMatrix ellpack_matrix;
|
||||
|
||||
@@ -610,27 +610,26 @@ struct DeviceShard {
|
||||
DeviceHistogram<GradientSumT> hist;
|
||||
|
||||
/*! \brief row_ptr from HistCutMatrix. */
|
||||
dh::DVec<uint32_t> feature_segments;
|
||||
common::Span<uint32_t> feature_segments;
|
||||
/*! \brief minimum value for each feature. */
|
||||
dh::DVec<bst_float> min_fvalue;
|
||||
common::Span<bst_float> min_fvalue;
|
||||
/*! \brief Cut. */
|
||||
dh::DVec<bst_float> gidx_fvalue_map;
|
||||
common::Span<bst_float> gidx_fvalue_map;
|
||||
/*! \brief global index of histogram, which is stored in ELLPack format. */
|
||||
dh::DVec<common::CompressedByteT> gidx_buffer;
|
||||
common::Span<common::CompressedByteT> gidx_buffer;
|
||||
|
||||
/*! \brief Row indices relative to this shard, necessary for sorting rows. */
|
||||
dh::DVec2<bst_uint> ridx;
|
||||
dh::DoubleBuffer<bst_uint> ridx;
|
||||
dh::DoubleBuffer<int> position;
|
||||
/*! \brief Gradient pair for each row. */
|
||||
dh::DVec<GradientPair> gpair;
|
||||
common::Span<GradientPair> gpair;
|
||||
|
||||
dh::DVec2<int> position;
|
||||
|
||||
dh::DVec<int> monotone_constraints;
|
||||
dh::DVec<bst_float> prediction_cache;
|
||||
common::Span<int> monotone_constraints;
|
||||
common::Span<bst_float> prediction_cache;
|
||||
|
||||
/*! \brief Sum gradient for each node. */
|
||||
std::vector<GradientPair> node_sum_gradients;
|
||||
dh::DVec<GradientPair> node_sum_gradients_d;
|
||||
common::Span<GradientPair> node_sum_gradients_d;
|
||||
/*! \brief row offset in SparsePage (the input data). */
|
||||
thrust::device_vector<size_t> row_ptrs;
|
||||
/*! \brief On-device feature set, only actually used on one of the devices */
|
||||
@@ -718,7 +717,9 @@ struct DeviceShard {
|
||||
// Reset values for each update iteration
|
||||
// Prepares this shard's per-iteration state on device `device_id`:
//   - fills the current `position` buffer and `node_sum_gradients` with
//     zero / default-constructed values
//   - writes the sequence 0, 1, 2, ... into the current `ridx` buffer
//   - resets `ridx_segments` so that only the first segment is non-empty,
//     covering [0, ridx.Size())
//   - copies this device's gradient pairs from `dh_gpair` into `gpair`,
//     applies SubsampleGradientPair and resets `hist`
// NOTE(review): this text is a rendered diff. Removed and added variants of
// several statements both appear, and the lines between the two hunks (inside
// the `left_counts` branch) are not visible here.
void Reset(HostDeviceVector<GradientPair>* dh_gpair) {
|
||||
dh::safe_cuda(cudaSetDevice(device_id));
|
||||
position.CurrentDVec().Fill(0);
|
||||
thrust::fill(
|
||||
thrust::device_pointer_cast(position.Current()),
|
||||
thrust::device_pointer_cast(position.Current() + position.Size()), 0);
|
||||
std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),
|
||||
GradientPair());
|
||||
if (left_counts.size() < 256) {
|
||||
@@ -727,13 +728,16 @@ struct DeviceShard {
|
||||
dh::safe_cuda(cudaMemsetAsync(left_counts.data().get(), 0,
|
||||
sizeof(int64_t) * left_counts.size()));
|
||||
}
|
||||
thrust::sequence(ridx.CurrentDVec().tbegin(), ridx.CurrentDVec().tend());
|
||||
thrust::sequence(
|
||||
thrust::device_pointer_cast(ridx.CurrentSpan().data()),
|
||||
thrust::device_pointer_cast(ridx.CurrentSpan().data() + ridx.Size()));
|
||||
|
||||
std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0));
|
||||
ridx_segments.front() = Segment(0, ridx.Size());
|
||||
this->gpair.copy(dh_gpair->tcbegin(device_id),
|
||||
dh_gpair->tcend(device_id));
|
||||
SubsampleGradientPair(&gpair, param.subsample, row_begin_idx);
|
||||
// The source is the HostDeviceVector's device pointer for `device_id` and
// the destination span is indexed from device kernels, so the transfer is
// device-to-device; a host-to-host copy kind does not match these pointers.
dh::safe_cuda(cudaMemcpyAsync(
|
||||
gpair.data(), dh_gpair->ConstDevicePointer(device_id),
|
||||
gpair.size() * sizeof(GradientPair), cudaMemcpyDeviceToDevice));
|
||||
SubsampleGradientPair(device_id, gpair, param.subsample, row_begin_idx);
|
||||
hist.Reset();
|
||||
}
|
||||
|
||||
@@ -788,7 +792,7 @@ struct DeviceShard {
|
||||
<<<uint32_t(d_feature_set.size()), kBlockThreads, 0, streams[i]>>>(
|
||||
hist.GetNodeHistogram(nidx), d_feature_set, node, ellpack_matrix,
|
||||
gpu_param, d_split_candidates, value_constraints[nidx],
|
||||
monotone_constraints.GetSpan());
|
||||
monotone_constraints);
|
||||
|
||||
// Reduce over features to find best feature
|
||||
auto d_result = d_result_all.subspan(i, 1);
|
||||
@@ -943,8 +947,8 @@ struct DeviceShard {
|
||||
void UpdatePredictionCache(bst_float* out_preds_d) {
|
||||
dh::safe_cuda(cudaSetDevice(device_id));
|
||||
if (!prediction_cache_initialised) {
|
||||
dh::safe_cuda(cudaMemcpyAsync(prediction_cache.Data(), out_preds_d,
|
||||
prediction_cache.Size() * sizeof(bst_float),
|
||||
dh::safe_cuda(cudaMemcpyAsync(prediction_cache.data(), out_preds_d,
|
||||
prediction_cache.size() * sizeof(bst_float),
|
||||
cudaMemcpyDefault));
|
||||
}
|
||||
prediction_cache_initialised = true;
|
||||
@@ -952,16 +956,16 @@ struct DeviceShard {
|
||||
CalcWeightTrainParam param_d(param);
|
||||
|
||||
dh::safe_cuda(
|
||||
cudaMemcpyAsync(node_sum_gradients_d.Data(), node_sum_gradients.data(),
|
||||
cudaMemcpyAsync(node_sum_gradients_d.data(), node_sum_gradients.data(),
|
||||
sizeof(GradientPair) * node_sum_gradients.size(),
|
||||
cudaMemcpyHostToDevice));
|
||||
auto d_position = position.Current();
|
||||
auto d_ridx = ridx.Current();
|
||||
auto d_node_sum_gradients = node_sum_gradients_d.Data();
|
||||
auto d_prediction_cache = prediction_cache.Data();
|
||||
auto d_node_sum_gradients = node_sum_gradients_d.data();
|
||||
auto d_prediction_cache = prediction_cache.data();
|
||||
|
||||
dh::LaunchN(
|
||||
device_id, prediction_cache.Size(), [=] __device__(int local_idx) {
|
||||
device_id, prediction_cache.size(), [=] __device__(int local_idx) {
|
||||
int pos = d_position[local_idx];
|
||||
bst_float weight = CalcWeight(param_d, d_node_sum_gradients[pos]);
|
||||
d_prediction_cache[d_ridx[local_idx]] +=
|
||||
@@ -969,8 +973,8 @@ struct DeviceShard {
|
||||
});
|
||||
|
||||
dh::safe_cuda(cudaMemcpy(
|
||||
out_preds_d, prediction_cache.Data(),
|
||||
prediction_cache.Size() * sizeof(bst_float), cudaMemcpyDefault));
|
||||
out_preds_d, prediction_cache.data(),
|
||||
prediction_cache.size() * sizeof(bst_float), cudaMemcpyDefault));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -981,7 +985,7 @@ struct SharedMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
|
||||
auto segment_begin = segment.begin;
|
||||
auto d_node_hist = shard->hist.GetNodeHistogram(nidx);
|
||||
auto d_ridx = shard->ridx.Current();
|
||||
auto d_gpair = shard->gpair.Data();
|
||||
auto d_gpair = shard->gpair.data();
|
||||
|
||||
auto n_elements = segment.Size() * shard->ellpack_matrix.row_stride;
|
||||
|
||||
@@ -1006,7 +1010,7 @@ struct GlobalMemHistBuilder : public GPUHistBuilderBase<GradientSumT> {
|
||||
Segment segment = shard->ridx_segments[nidx];
|
||||
auto d_node_hist = shard->hist.GetNodeHistogram(nidx).data();
|
||||
bst_uint* d_ridx = shard->ridx.Current();
|
||||
GradientPair* d_gpair = shard->gpair.Data();
|
||||
GradientPair* d_gpair = shard->gpair.data();
|
||||
|
||||
size_t const n_elements = segment.Size() * shard->ellpack_matrix.row_stride;
|
||||
auto d_matrix = shard->ellpack_matrix;
|
||||
@@ -1043,10 +1047,11 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
|
||||
&gidx_fvalue_map, hmat.cut.size(),
|
||||
&min_fvalue, hmat.min_val.size(),
|
||||
&monotone_constraints, param.monotone_constraints.size());
|
||||
gidx_fvalue_map = hmat.cut;
|
||||
min_fvalue = hmat.min_val;
|
||||
feature_segments = hmat.row_ptr;
|
||||
monotone_constraints = param.monotone_constraints;
|
||||
|
||||
dh::CopyVectorToDeviceSpan(gidx_fvalue_map, hmat.cut);
|
||||
dh::CopyVectorToDeviceSpan(min_fvalue, hmat.min_val);
|
||||
dh::CopyVectorToDeviceSpan(feature_segments, hmat.row_ptr);
|
||||
dh::CopyVectorToDeviceSpan(monotone_constraints, param.monotone_constraints);
|
||||
|
||||
node_sum_gradients.resize(max_nodes);
|
||||
ridx_segments.resize(max_nodes);
|
||||
@@ -1063,14 +1068,16 @@ inline void DeviceShard<GradientSumT>::InitCompressedData(
|
||||
<< "Max leaves and max depth cannot both be unconstrained for "
|
||||
"gpu_hist.";
|
||||
ba.Allocate(device_id, &gidx_buffer, compressed_size_bytes);
|
||||
gidx_buffer.Fill(0);
|
||||
thrust::fill(
|
||||
thrust::device_pointer_cast(gidx_buffer.data()),
|
||||
thrust::device_pointer_cast(gidx_buffer.data() + gidx_buffer.size()), 0);
|
||||
|
||||
this->CreateHistIndices(row_batch, row_stride, null_gidx_value);
|
||||
|
||||
ellpack_matrix.Init(
|
||||
feature_segments.GetSpan(), min_fvalue.GetSpan(),
|
||||
gidx_fvalue_map.GetSpan(), row_stride,
|
||||
common::CompressedIterator<uint32_t>(gidx_buffer.Data(), num_symbols),
|
||||
feature_segments, min_fvalue,
|
||||
gidx_fvalue_map, row_stride,
|
||||
common::CompressedIterator<uint32_t>(gidx_buffer.data(), num_symbols),
|
||||
is_dense, null_gidx_value);
|
||||
|
||||
// check if we can use shared memory for building histograms
|
||||
@@ -1121,10 +1128,10 @@ inline void DeviceShard<GradientSumT>::CreateHistIndices(
|
||||
dh::DivRoundUp(row_stride, block3.y), 1);
|
||||
CompressBinEllpackKernel<<<grid3, block3>>>
|
||||
(common::CompressedBufferWriter(num_symbols),
|
||||
gidx_buffer.Data(),
|
||||
gidx_buffer.data(),
|
||||
row_ptrs.data().get() + batch_row_begin,
|
||||
entries_d.data().get(),
|
||||
gidx_fvalue_map.Data(), feature_segments.Data(),
|
||||
gidx_fvalue_map.data(), feature_segments.data(),
|
||||
batch_row_begin, batch_nrows,
|
||||
row_ptrs[batch_row_begin],
|
||||
row_stride, null_gidx_value);
|
||||
@@ -1355,7 +1362,7 @@ class GPUHistMakerSpecialised{
|
||||
[&](int i, std::unique_ptr<DeviceShard<GradientSumT>>& shard) {
|
||||
dh::safe_cuda(cudaSetDevice(shard->device_id));
|
||||
tmp_sums[i] = dh::SumReduction(
|
||||
shard->temp_memory, shard->gpair.Data(), shard->gpair.Size());
|
||||
shard->temp_memory, shard->gpair.data(), shard->gpair.size());
|
||||
});
|
||||
|
||||
GradientPair sum_gradient =
|
||||
|
||||
Reference in New Issue
Block a user