Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage. (#3446)

* Replaced std::vector with HostDeviceVector in MetaInfo and SparsePage.

- added distributions to HostDeviceVector
- using HostDeviceVector for labels, weights and base margings in MetaInfo
- using HostDeviceVector for offset and data in SparsePage
- other necessary refactoring

* Added const version of HostDeviceVector API calls.

- const versions added to calls that can trigger data transfers, e.g. DevicePointer()
- updated the code that uses HostDeviceVector
- objective functions now accept const HostDeviceVector<bst_float>& for predictions

* Updated src/linear/updater_gpu_coordinate.cu.

* Added read-only state for HostDeviceVector sync.

- this means no copies are performed if both host and devices access
  the HostDeviceVector read-only

* Fixed linter and test errors.

- updated the lz4 plugin
- added ConstDeviceSpan to HostDeviceVector
- using device % dh::NVisibleDevices() for the physical device number,
  e.g. in calls to cudaSetDevice()

* Fixed explicit template instantiation errors for HostDeviceVector.

- replaced HostDeviceVector<unsigned int> with HostDeviceVector<int>

* Fixed HostDeviceVector tests that require multiple GPUs.

- added a mock set device handler; when set, it is called instead of cudaSetDevice()
This commit is contained in:
Andy Adinets
2018-08-30 04:28:47 +02:00
committed by Rory Mitchell
parent 58d783df16
commit 72cd1517d6
45 changed files with 1141 additions and 560 deletions

View File

@@ -387,11 +387,13 @@ struct DeviceShard {
void InitRowPtrs(const SparsePage& row_batch) {
dh::safe_cuda(cudaSetDevice(device_idx));
const auto& offset_vec = row_batch.offset.HostVector();
row_ptrs.resize(n_rows + 1);
thrust::copy(row_batch.offset.data() + row_begin_idx,
row_batch.offset.data() + row_end_idx + 1,
thrust::copy(offset_vec.data() + row_begin_idx,
offset_vec.data() + row_end_idx + 1,
row_ptrs.begin());
auto row_iter = row_ptrs.begin();
// find the maximum row size
auto get_size = [=] __device__(size_t row) {
return row_iter[row + 1] - row_iter[row];
}; // NOLINT
@@ -432,9 +434,12 @@ struct DeviceShard {
(dh::TotalMemory(device_idx) / (16 * row_stride * sizeof(Entry)),
static_cast<size_t>(n_rows));
thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
const auto& offset_vec = row_batch.offset.HostVector();
const auto& data_vec = row_batch.data.HostVector();
thrust::device_vector<Entry> entries_d(gpu_batch_nrows * row_stride);
size_t gpu_nbatches = dh::DivRoundUp(n_rows, gpu_batch_nrows);
for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) {
size_t batch_row_begin = gpu_batch * gpu_batch_nrows;
size_t batch_row_end = (gpu_batch + 1) * gpu_batch_nrows;
@@ -443,12 +448,12 @@ struct DeviceShard {
}
size_t batch_nrows = batch_row_end - batch_row_begin;
size_t n_entries =
row_batch.offset[row_begin_idx + batch_row_end] -
row_batch.offset[row_begin_idx + batch_row_begin];
offset_vec[row_begin_idx + batch_row_end] -
offset_vec[row_begin_idx + batch_row_begin];
dh::safe_cuda
(cudaMemcpy
(entries_d.data().get(),
&row_batch.data[row_batch.offset[row_begin_idx + batch_row_begin]],
data_vec.data() + offset_vec[row_begin_idx + batch_row_begin],
n_entries * sizeof(Entry), cudaMemcpyDefault));
dim3 block3(32, 8, 1);
dim3 grid3(dh::DivRoundUp(n_rows, block3.x),
@@ -458,7 +463,7 @@ struct DeviceShard {
row_ptrs.data().get() + batch_row_begin,
entries_d.data().get(), cuts_d.data().get(), cut_row_ptrs_d.data().get(),
batch_row_begin, batch_nrows,
row_batch.offset[row_begin_idx + batch_row_begin],
offset_vec[row_begin_idx + batch_row_begin],
row_stride, null_gidx_value);
dh::safe_cuda(cudaGetLastError());
@@ -538,7 +543,7 @@ struct DeviceShard {
std::fill(ridx_segments.begin(), ridx_segments.end(), Segment(0, 0));
ridx_segments.front() = Segment(0, ridx.Size());
this->gpair.copy(dh_gpair->tbegin(device_idx), dh_gpair->tend(device_idx));
this->gpair.copy(dh_gpair->tcbegin(device_idx), dh_gpair->tcend(device_idx));
SubsampleGradientPair(&gpair, param.subsample, row_begin_idx);
hist.Reset();
}