Implement GPU accelerated coordinate descent algorithm (#3178)

* Implement GPU accelerated coordinate descent algorithm. 

* Exclude external memory tests for GPU
This commit is contained in:
Rory Mitchell
2018-04-20 14:56:35 +12:00
committed by GitHub
parent ccf80703ef
commit a185ddfe03
12 changed files with 473 additions and 63 deletions

View File

@@ -374,10 +374,10 @@ class DVec {
safe_cuda(cudaSetDevice(this->DeviceIdx()));
if (end - begin != Size()) {
throw std::runtime_error(
"Cannot copy assign vector to DVec, sizes are different");
"Cannot copy assign vector to dvec, sizes are different");
}
safe_cuda(cudaMemcpy(this->Data(), begin.get(),
Size() * sizeof(T), cudaMemcpyDefault));
safe_cuda(cudaMemcpy(this->Data(), begin.get(), Size() * sizeof(T),
cudaMemcpyDefault));
}
};
@@ -544,7 +544,7 @@ struct CubMemory {
size_t temp_storage_bytes;
// Thrust
using ValueT = char;
using value_type = char; // NOLINT
CubMemory() : d_temp_storage(nullptr), temp_storage_bytes(0) {}
@@ -807,18 +807,20 @@ void SumReduction(dh::CubMemory &tmp_mem, dh::DVec<T> &in, dh::DVec<T> &out,
* @param nVals number of elements in the input array
*/
template <typename T>
T SumReduction(dh::CubMemory &tmp_mem, T *in, int nVals) {
typename std::iterator_traits<T>::value_type SumReduction(dh::CubMemory &tmp_mem, T in, int nVals) {
using ValueT = typename std::iterator_traits<T>::value_type;
size_t tmpSize;
dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, tmpSize, in, in, nVals));
// Allocate small extra memory for the return value
tmp_mem.LazyAllocate(tmpSize + sizeof(T));
auto ptr = reinterpret_cast<T *>(tmp_mem.d_temp_storage) + 1;
tmp_mem.LazyAllocate(tmpSize + sizeof(ValueT));
auto ptr = reinterpret_cast<ValueT *>(tmp_mem.d_temp_storage) + 1;
dh::safe_cuda(cub::DeviceReduce::Sum(
reinterpret_cast<void *>(ptr), tmpSize, in,
reinterpret_cast<T *>(tmp_mem.d_temp_storage), nVals));
T sum;
dh::safe_cuda(cudaMemcpy(&sum, tmp_mem.d_temp_storage, sizeof(T),
cudaMemcpyDeviceToHost));
reinterpret_cast<void *>(ptr), tmpSize, in,
reinterpret_cast<ValueT *>(tmp_mem.d_temp_storage),
nVals));
ValueT sum;
dh::safe_cuda(cudaMemcpy(&sum, tmp_mem.d_temp_storage, sizeof(ValueT),
cudaMemcpyDeviceToHost));
return sum;
}
@@ -876,7 +878,8 @@ void Gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
* \class AllReducer
*
* \brief All reducer class that manages its own communication group and
* streams. Must be initialised before use. If XGBoost is compiled without NCCL this is a dummy class that will error if used with more than one GPU.
* streams. Must be initialised before use. If XGBoost is compiled without NCCL
* this is a dummy class that will error if used with more than one GPU.
*/
class AllReducer {
@@ -912,7 +915,8 @@ class AllReducer {
}
initialised = true;
#else
CHECK_EQ(device_ordinals.size(), 1) << "XGBoost must be compiled with NCCL to use more than one GPU.";
CHECK_EQ(device_ordinals.size(), 1)
<< "XGBoost must be compiled with NCCL to use more than one GPU.";
#endif
}
~AllReducer() {
@@ -929,16 +933,13 @@ class AllReducer {
}
/**
* \fn void AllReduceSum(int communication_group_idx, const double *sendbuff,
* double *recvbuff, int count)
*
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
* streams or comms.
*
* \param communication_group_idx Zero-based index of the
* communication group. \param sendbuff The sendbuff. \param
* sendbuff The sendbuff. \param [in,out] recvbuff
* The recvbuff. \param count Number of.
* \param communication_group_idx Zero-based index of the communication group.
* \param sendbuff The sendbuff.
* \param recvbuff The recvbuff.
* \param count Number of elements.
*/
void AllReduceSum(int communication_group_idx, const double *sendbuff,
@@ -954,17 +955,14 @@ class AllReducer {
}
/**
* \fn void AllReduceSum(int communication_group_idx, const int64_t *sendbuff, int64_t *recvbuff, int count)
*
* \brief Allreduce. Use in exactly the same way as NCCL but without needing streams or comms.
*
* \param communication_group_idx Zero-based index of the communication group. \param
* sendbuff The sendbuff. \param sendbuff
* The sendbuff. \param [in,out] recvbuff The recvbuff.
* \param count Number of.
* \param sendbuff The sendbuff.
* \param [in,out] recvbuff If non-null, the recvbuff.
* \param count Number of.
* \param count Number of.
*
* \param communication_group_idx Zero-based index of the communication group.
* \param sendbuff The sendbuff.
* \param recvbuff The recvbuff.
* \param count Number of elements.
*/
void AllReduceSum(int communication_group_idx, const int64_t *sendbuff,
@@ -993,4 +991,53 @@ class AllReducer {
#endif
}
};
/**
* \brief Executes some operation on each element of the input vector, using a
* single controlling thread for each element.
*
* \tparam T Generic type parameter.
* \tparam FunctionT Type of the function t.
* \param shards The shards.
* \param f The func_t to process.
*/
template <typename T, typename FunctionT>
void ExecuteShards(std::vector<T> *shards, FunctionT f) {
auto previous_num_threads = omp_get_max_threads();
omp_set_num_threads(shards->size());
#pragma omp parallel
{
auto cpu_thread_id = omp_get_thread_num();
f(shards->at(cpu_thread_id));
}
omp_set_num_threads(previous_num_threads);
}
/**
* \brief Executes some operation on each element of the input vector, using a single controlling
* thread for each element, returns the sum of the results.
*
* \tparam ReduceT Type of the reduce t.
* \tparam T Generic type parameter.
* \tparam FunctionT Type of the function t.
* \param shards The shards.
* \param f The func_t to process.
*
* \return A reduce_t.
*/
template <typename ReduceT,typename T, typename FunctionT>
ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
auto previous_num_threads = omp_get_max_threads();
omp_set_num_threads(shards->size());
std::vector<ReduceT> sums(shards->size());
#pragma omp parallel
{
auto cpu_thread_id = omp_get_thread_num();
sums[cpu_thread_id] = f(shards->at(cpu_thread_id));
}
omp_set_num_threads(previous_num_threads);
return std::accumulate(sums.begin(), sums.end(), ReduceT());
}
} // namespace dh