This makes GPU Hist robust in distributed environments, as some workers might not be associated with any data in either training or evaluation.

* Disable rabit mock test for now. See #5012.
* Disable dask-cudf test at prediction for now. See #5003.
* Launch dask jobs for all workers even if they might not have any data.
* Check for 0 rows in elementwise evaluation metrics. Using AUC and AUC-PR still throws an error; see #4663 for a robust fix.
* Add tests for edge cases.
* Add `LaunchKernel` wrapper handling zero-sized grids (a minimal sketch of the idea follows this list).
* Move some parts of the allreducer into a .cu file.
* Don't validate feature names when the booster is empty.
* Sync the number of columns in DMatrix, as num_feature is required to be the same across all workers in data split mode.
* Filtering in the dask interface now by default syncs all boosters that are not empty, instead of using rank 0.
* Fix Jenkins' GPU tests.
* Install dask-cuda from source in Jenkins' tests. Now all tests are actually running.
* Restore GPU Hist tree synchronization test.
* Check the UUID of running devices. The check is only performed on CUDA >= 10.x, as 9.x doesn't have a UUID field.
* Fix CMake policy and project variables. Use xgboost_SOURCE_DIR uniformly, and add a policy for CMake >= 3.13.
* Fix copying data to CPU.
* Fix race condition in the CPU predictor.
* Fix duplicated DMatrix construction.
* Don't download extra NCCL in the CI script.
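The zero-sized-grid guard is what lets workers without any data participate safely. Below is a minimal sketch of the idea only; the actual `dh::LaunchKernel` helper lives in `device_helpers.cuh` and may differ, and the wrapper name `SafeLaunchKernel`, the block size, and the kernel signature here are assumptions for illustration.

```cuda
#include <cstddef>

// Hypothetical zero-size-safe launch wrapper (illustrative, not the real dh::LaunchKernel).
template <typename... Args>
void SafeLaunchKernel(void (*kernel)(Args...), std::size_t n, Args... args) {
  if (n == 0) {
    return;  // Worker holds no rows: skip the launch, since a <<<0, ...>>> launch is invalid.
  }
  constexpr unsigned kBlockSize = 256;  // assumed block size
  auto grid_size = static_cast<unsigned>((n + kBlockSize - 1) / kBlockSize);
  kernel<<<grid_size, kBlockSize>>>(args...);  // grid_size >= 1 at this point
}
```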

/*!
 * Copyright 2019 by XGBoost Contributors
 */
#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_
#define XGBOOST_DATA_ELLPACK_PAGE_H_

#include <xgboost/data.h>

#include "../common/compressed_iterator.h"
#include "../common/device_helpers.cuh"
#include "../common/hist_util.h"

namespace xgboost {

// Find a gidx value for a given feature otherwise return -1 if not found
__forceinline__ __device__ int BinarySearchRow(
    bst_uint begin, bst_uint end,
    common::CompressedIterator<uint32_t> data,
    int const fidx_begin, int const fidx_end) {
  bst_uint previous_middle = UINT32_MAX;
  while (end != begin) {
    auto middle = begin + (end - begin) / 2;
    if (middle == previous_middle) {
      break;
    }
    previous_middle = middle;

    auto gidx = data[middle];

    if (gidx >= fidx_begin && gidx < fidx_end) {
      return gidx;
    } else if (gidx < fidx_begin) {
      begin = middle;
    } else {
      end = middle;
    }
  }
  // Value is missing
  return -1;
}

/** \brief Meta information about the ELLPACK matrix. */
struct EllpackInfo {
  /*! \brief Whether the matrix is dense. */
  bool is_dense;
  /*! \brief Row length for ELLPACK, equal to the number of features. */
  size_t row_stride;
  /*! \brief Total number of bins, also used as the null index value. */
  size_t n_bins;
  /*! \brief Minimum value for each feature. Size equals the number of features. */
  common::Span<bst_float> min_fvalue;
  /*! \brief Histogram cut pointers. Size equals (number of features + 1). */
  common::Span<uint32_t> feature_segments;
  /*! \brief Histogram cut values. Size equals (bins per feature * number of features). */
  common::Span<bst_float> gidx_fvalue_map;

  EllpackInfo() = default;

  /*!
   * \brief Constructor.
   *
   * @param device The GPU device to use.
   * @param is_dense Whether the matrix is dense.
   * @param row_stride The number of features between starts of consecutive rows.
   * @param hmat The histogram cuts of all the features.
   * @param ba The BulkAllocator that owns the GPU memory.
   */
  explicit EllpackInfo(int device,
                       bool is_dense,
                       size_t row_stride,
                       const common::HistogramCuts& hmat,
                       dh::BulkAllocator* ba);
};

/** \brief Struct for accessing and manipulating an ellpack matrix on the
 * device. Does not own underlying memory and may be trivially copied into
 * kernels. */
struct EllpackMatrix {
  EllpackInfo info;
  common::CompressedIterator<uint32_t> gidx_iter;

  XGBOOST_DEVICE size_t BinCount() const { return info.gidx_fvalue_map.size(); }

  // Given a row index and a feature index, returns the corresponding cut value.
  // Uses binary search for the lookup; returns NaN if the value is missing.
  __device__ bst_float GetElement(size_t ridx, size_t fidx) const {
    auto row_begin = info.row_stride * ridx;
    auto row_end = row_begin + info.row_stride;
    auto gidx = -1;
    if (info.is_dense) {
      gidx = gidx_iter[row_begin + fidx];
    } else {
      gidx = BinarySearchRow(row_begin,
                             row_end,
                             gidx_iter,
                             info.feature_segments[fidx],
                             info.feature_segments[fidx + 1]);
    }
    if (gidx == -1) {
      return nan("");
    }
    return info.gidx_fvalue_map[gidx];
  }
};
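
// Illustrative device-side usage (not part of the original header; the kernel and
// variable names below are hypothetical). EllpackMatrix does not own its memory and
// may be trivially copied, so it can be passed to a kernel by value:
//
//   __global__ void ReadElementKernel(EllpackMatrix matrix, size_t ridx, size_t fidx) {
//     bst_float value = matrix.GetElement(ridx, fidx);
//     if (isnan(value)) {
//       // The (row, feature) entry is missing from the sparse input.
//     }
//   }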

// Instances of this type are created while creating the histogram bins for the
// entire dataset across multiple sparse page batches. This keeps track of the number
// of rows to process from a batch and the position from which to process on each device.
struct RowStateOnDevice {
  // Number of rows assigned to this device
  size_t total_rows_assigned_to_device;
  // Number of rows processed thus far
  size_t total_rows_processed;
  // Number of rows to process from the current sparse page batch
  size_t rows_to_process_from_batch;
  // Offset from the current sparse page batch to begin processing
  size_t row_offset_in_current_batch;

  explicit RowStateOnDevice(size_t total_rows)
      : total_rows_assigned_to_device(total_rows), total_rows_processed(0),
        rows_to_process_from_batch(0), row_offset_in_current_batch(0) {
  }

  explicit RowStateOnDevice(size_t total_rows, size_t batch_rows)
      : total_rows_assigned_to_device(total_rows), total_rows_processed(0),
        rows_to_process_from_batch(batch_rows), row_offset_in_current_batch(0) {
  }

  // Advance the row state by the number of rows processed
  void Advance() {
    total_rows_processed += rows_to_process_from_batch;
    CHECK_LE(total_rows_processed, total_rows_assigned_to_device);
    rows_to_process_from_batch = row_offset_in_current_batch = 0;
  }
};

// An instance of this type keeps track of the total number of rows to process,
// the rows processed thus far, the rows to process from the current sparse page
// batch, and the offset within that batch at which to begin processing on each device.
class DeviceHistogramBuilderState {
 public:
  explicit DeviceHistogramBuilderState(size_t n_rows) : device_row_state_(n_rows) {}

  const RowStateOnDevice& GetRowStateOnDevice() const {
    return device_row_state_;
  }

  // This method is invoked at the beginning of each sparse page batch. This distributes
  // the rows in the sparse page to the device.
  // TODO(sriramch): Think of a way to utilize *all* the GPUs to build the compressed bins.
  void BeginBatch(const SparsePage &batch) {
    size_t rem_rows = batch.Size();
    size_t row_offset_in_current_batch = 0;

    // Do we have any more rows left to process from this batch on this device?
    if (device_row_state_.total_rows_assigned_to_device > device_row_state_.total_rows_processed) {
      // There are still some rows that need to be assigned to this device
      device_row_state_.rows_to_process_from_batch =
          std::min(
              device_row_state_.total_rows_assigned_to_device - device_row_state_.total_rows_processed,
              rem_rows);
    } else {
      // All rows have been assigned to this device
      device_row_state_.rows_to_process_from_batch = 0;
    }

    device_row_state_.row_offset_in_current_batch = row_offset_in_current_batch;
    row_offset_in_current_batch += device_row_state_.rows_to_process_from_batch;
    rem_rows -= device_row_state_.rows_to_process_from_batch;
  }

  // This method is invoked after completion of each sparse page batch
  void EndBatch() {
    device_row_state_.Advance();
  }

 private:
  RowStateOnDevice device_row_state_{0};
};
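
// Illustrative host-side flow (not part of the original header; variable names are
// hypothetical). The builder state walks the sparse page batches and tells each
// compression step how many rows of the current batch belong to this device:
//
//   DeviceHistogramBuilderState row_state(dmat->Info().num_row_);
//   for (const auto& batch : dmat->GetBatches<SparsePage>()) {
//     row_state.BeginBatch(batch);
//     page->CreateHistIndices(device, batch, row_state.GetRowStateOnDevice());
//     row_state.EndBatch();
//   }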

class EllpackPageImpl {
 public:
  EllpackMatrix matrix;
  /*! \brief Global index of histogram, stored in ELLPACK format. */
  common::Span<common::CompressedByteT> gidx_buffer;
  std::vector<common::CompressedByteT> idx_buffer;
  size_t n_rows{};

  /*!
   * \brief Default constructor.
   *
   * This is used in the external memory case. An empty ELLPACK page is constructed with its
   * content set later by the reader.
   */
  EllpackPageImpl() = default;

  /*!
   * \brief Constructor from an existing DMatrix.
   *
   * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix
   * in CSR format.
   */
  explicit EllpackPageImpl(DMatrix* dmat, const BatchParam& parm);

  /*!
   * \brief Initialize the EllpackInfo contained in the EllpackMatrix.
   *
   * This is used in the in-memory case. The current page owns the BulkAllocator, which in turn
   * owns the GPU memory used by the EllpackInfo.
   *
   * @param device The GPU device to use.
   * @param is_dense Whether the matrix is dense.
   * @param row_stride The number of features between starts of consecutive rows.
   * @param hmat The histogram cuts of all the features.
   */
  void InitInfo(int device, bool is_dense, size_t row_stride, const common::HistogramCuts& hmat);

  /*!
   * \brief Initialize the buffer to store compressed features.
   *
   * @param device The GPU device to use.
   * @param num_rows The number of rows we are storing in the buffer.
   */
  void InitCompressedData(int device, size_t num_rows);

  /*!
   * \brief Compress a single page of CSR data into ELLPACK.
   *
   * @param device The GPU device to use.
   * @param row_batch The CSR page.
   * @param device_row_state On-device data for maintaining state.
   */
  void CreateHistIndices(int device,
                         const SparsePage& row_batch,
                         const RowStateOnDevice& device_row_state);

  /*! \return Number of instances in the page. */
  size_t Size() const;

  /*! \brief Set the base row id for this page. */
  inline void SetBaseRowId(size_t row_id) {
    base_rowid_ = row_id;
  }

  /*! \brief Clear the page. */
  void Clear();

  /*!
   * \brief Push a sparse page.
   * \param batch The row page.
   */
  void Push(int device, const SparsePage& batch);

  /*! \return Estimation of the memory cost of this page. */
  size_t MemCostBytes() const;

  /*!
   * \brief Copy the ELLPACK matrix to the GPU.
   *
   * @param device The GPU device to use.
   * @param info The EllpackInfo for the matrix.
   */
  void InitDevice(int device, EllpackInfo info);

 private:
  common::Monitor monitor_;
  dh::BulkAllocator ba_;
  size_t base_rowid_{};
  bool device_initialized_{false};
};
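
// Illustrative in-memory construction (not part of the original header; `dmat` and
// `param` stand for a DMatrix pointer and a BatchParam prepared by the caller):
//
//   EllpackPageImpl page(dmat, param);
//   size_t num_rows = page.Size();          // number of instances in the page
//   size_t mem_cost = page.MemCostBytes();  // estimated memory cost of the page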

}  // namespace xgboost

#endif  // XGBOOST_DATA_ELLPACK_PAGE_H_