Implement GPU accelerated coordinate descent algorithm (#3178)
* Implement GPU accelerated coordinate descent algorithm. * Exclude external memory tests for GPU
This commit is contained in:
@@ -374,10 +374,10 @@ class DVec {
|
||||
safe_cuda(cudaSetDevice(this->DeviceIdx()));
|
||||
if (end - begin != Size()) {
|
||||
throw std::runtime_error(
|
||||
"Cannot copy assign vector to DVec, sizes are different");
|
||||
"Cannot copy assign vector to dvec, sizes are different");
|
||||
}
|
||||
safe_cuda(cudaMemcpy(this->Data(), begin.get(),
|
||||
Size() * sizeof(T), cudaMemcpyDefault));
|
||||
safe_cuda(cudaMemcpy(this->Data(), begin.get(), Size() * sizeof(T),
|
||||
cudaMemcpyDefault));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -544,7 +544,7 @@ struct CubMemory {
|
||||
size_t temp_storage_bytes;
|
||||
|
||||
// Thrust
|
||||
using ValueT = char;
|
||||
using value_type = char; // NOLINT
|
||||
|
||||
CubMemory() : d_temp_storage(nullptr), temp_storage_bytes(0) {}
|
||||
|
||||
@@ -807,18 +807,20 @@ void SumReduction(dh::CubMemory &tmp_mem, dh::DVec<T> &in, dh::DVec<T> &out,
|
||||
* @param nVals number of elements in the input array
|
||||
*/
|
||||
template <typename T>
|
||||
T SumReduction(dh::CubMemory &tmp_mem, T *in, int nVals) {
|
||||
typename std::iterator_traits<T>::value_type SumReduction(dh::CubMemory &tmp_mem, T in, int nVals) {
|
||||
using ValueT = typename std::iterator_traits<T>::value_type;
|
||||
size_t tmpSize;
|
||||
dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, tmpSize, in, in, nVals));
|
||||
// Allocate small extra memory for the return value
|
||||
tmp_mem.LazyAllocate(tmpSize + sizeof(T));
|
||||
auto ptr = reinterpret_cast<T *>(tmp_mem.d_temp_storage) + 1;
|
||||
tmp_mem.LazyAllocate(tmpSize + sizeof(ValueT));
|
||||
auto ptr = reinterpret_cast<ValueT *>(tmp_mem.d_temp_storage) + 1;
|
||||
dh::safe_cuda(cub::DeviceReduce::Sum(
|
||||
reinterpret_cast<void *>(ptr), tmpSize, in,
|
||||
reinterpret_cast<T *>(tmp_mem.d_temp_storage), nVals));
|
||||
T sum;
|
||||
dh::safe_cuda(cudaMemcpy(&sum, tmp_mem.d_temp_storage, sizeof(T),
|
||||
cudaMemcpyDeviceToHost));
|
||||
reinterpret_cast<void *>(ptr), tmpSize, in,
|
||||
reinterpret_cast<ValueT *>(tmp_mem.d_temp_storage),
|
||||
nVals));
|
||||
ValueT sum;
|
||||
dh::safe_cuda(cudaMemcpy(&sum, tmp_mem.d_temp_storage, sizeof(ValueT),
|
||||
cudaMemcpyDeviceToHost));
|
||||
return sum;
|
||||
}
|
||||
|
||||
@@ -876,7 +878,8 @@ void Gather(int device_idx, T *out, const T *in, const int *instId, int nVals) {
|
||||
* \class AllReducer
|
||||
*
|
||||
* \brief All reducer class that manages its own communication group and
|
||||
* streams. Must be initialised before use. If XGBoost is compiled without NCCL this is a dummy class that will error if used with more than one GPU.
|
||||
* streams. Must be initialised before use. If XGBoost is compiled without NCCL
|
||||
* this is a dummy class that will error if used with more than one GPU.
|
||||
*/
|
||||
|
||||
class AllReducer {
|
||||
@@ -912,7 +915,8 @@ class AllReducer {
|
||||
}
|
||||
initialised = true;
|
||||
#else
|
||||
CHECK_EQ(device_ordinals.size(), 1) << "XGBoost must be compiled with NCCL to use more than one GPU.";
|
||||
CHECK_EQ(device_ordinals.size(), 1)
|
||||
<< "XGBoost must be compiled with NCCL to use more than one GPU.";
|
||||
#endif
|
||||
}
|
||||
~AllReducer() {
|
||||
@@ -929,16 +933,13 @@ class AllReducer {
|
||||
}
|
||||
|
||||
/**
|
||||
* \fn void AllReduceSum(int communication_group_idx, const double *sendbuff,
|
||||
* double *recvbuff, int count)
|
||||
*
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing
|
||||
* streams or comms.
|
||||
*
|
||||
* \param communication_group_idx Zero-based index of the
|
||||
* communication group. \param sendbuff The sendbuff. \param
|
||||
* sendbuff The sendbuff. \param [in,out] recvbuff
|
||||
* The recvbuff. \param count Number of.
|
||||
* \param communication_group_idx Zero-based index of the communication group.
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
|
||||
void AllReduceSum(int communication_group_idx, const double *sendbuff,
|
||||
@@ -954,17 +955,14 @@ class AllReducer {
|
||||
}
|
||||
|
||||
/**
|
||||
* \fn void AllReduceSum(int communication_group_idx, const int64_t *sendbuff, int64_t *recvbuff, int count)
|
||||
*
|
||||
* \brief Allreduce. Use in exactly the same way as NCCL but without needing streams or comms.
|
||||
*
|
||||
* \param communication_group_idx Zero-based index of the communication group. \param
|
||||
* sendbuff The sendbuff. \param sendbuff
|
||||
* The sendbuff. \param [in,out] recvbuff The recvbuff.
|
||||
* \param count Number of.
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param [in,out] recvbuff If non-null, the recvbuff.
|
||||
* \param count Number of.
|
||||
* \param count Number of.
|
||||
*
|
||||
* \param communication_group_idx Zero-based index of the communication group.
|
||||
* \param sendbuff The sendbuff.
|
||||
* \param recvbuff The recvbuff.
|
||||
* \param count Number of elements.
|
||||
*/
|
||||
|
||||
void AllReduceSum(int communication_group_idx, const int64_t *sendbuff,
|
||||
@@ -993,4 +991,53 @@ class AllReducer {
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * \brief Applies an operation to every element of the input vector, asking
 *        the OpenMP runtime for one controlling CPU thread per element.
 *
 * The elements are distributed with a work-sharing loop, so every shard is
 * visited exactly once even when the runtime grants fewer threads than
 * requested (or when OpenMP is disabled); in that case a thread simply
 * handles more than one shard in turn. The process-wide OpenMP thread count
 * setting is not modified.
 *
 * \tparam T         Element type stored in the shard vector.
 * \tparam FunctionT Type of the callable; invoked as f(T&).
 * \param shards Pointer to the vector of shards to process. An empty vector
 *               is a no-op.
 * \param f      The operation to run on each shard.
 */
template <typename T, typename FunctionT>
void ExecuteShards(std::vector<T> *shards, FunctionT f) {
  // Signed loop counter: older OpenMP implementations only accept signed
  // integer induction variables in a work-sharing loop.
  const int n_shards = static_cast<int>(shards->size());
  // Requesting a team of zero threads is not valid, and there is no work.
  if (n_shards == 0) {
    return;
  }
  // schedule(static, 1) hands iteration i to thread i when the full team of
  // n_shards threads is available, preserving one thread per shard.
#pragma omp parallel for schedule(static, 1) num_threads(n_shards)
  for (int shard_idx = 0; shard_idx < n_shards; ++shard_idx) {
    f(shards->at(shard_idx));
  }
}
|
||||
|
||||
/**
 * \brief Applies an operation to every element of the input vector, asking
 *        the OpenMP runtime for one controlling CPU thread per element, and
 *        returns the sum of the per-element results.
 *
 * The elements are distributed with a work-sharing loop, so every shard
 * contributes to the sum even when the runtime grants fewer threads than
 * requested (or when OpenMP is disabled). The process-wide OpenMP thread
 * count setting is not modified.
 *
 * \tparam ReduceT   Type of the per-shard result and of the returned sum;
 *                   must be default-constructible and support operator+.
 * \tparam T         Element type stored in the shard vector.
 * \tparam FunctionT Type of the callable; invoked as f(T&), with a result
 *                   convertible to ReduceT.
 * \param shards Pointer to the vector of shards to process.
 * \param f      The operation to run on each shard.
 *
 * \return The per-shard results accumulated in shard order onto ReduceT();
 *         ReduceT() itself when the vector is empty.
 */
template <typename ReduceT, typename T, typename FunctionT>
ReduceT ReduceShards(std::vector<T> *shards, FunctionT f) {
  // Signed loop counter: older OpenMP implementations only accept signed
  // integer induction variables in a work-sharing loop.
  const int n_shards = static_cast<int>(shards->size());
  // Requesting a team of zero threads is not valid; the empty sum is ReduceT().
  if (n_shards == 0) {
    return ReduceT();
  }
  // One slot per shard so that threads never write to the same element.
  std::vector<ReduceT> sums(n_shards);
  // schedule(static, 1) hands iteration i to thread i when the full team of
  // n_shards threads is available, preserving one thread per shard.
#pragma omp parallel for schedule(static, 1) num_threads(n_shards)
  for (int shard_idx = 0; shard_idx < n_shards; ++shard_idx) {
    sums[shard_idx] = f(shards->at(shard_idx));
  }
  return std::accumulate(sums.begin(), sums.end(), ReduceT());
}
|
||||
} // namespace dh
|
||||
|
||||
Reference in New Issue
Block a user