Export Python Interface for external memory. (#7070)

* Add Python iterator interface.
* Add tests.
* Add demo.
* Add documentation.
* Handle empty dataset.
This commit is contained in:
Jiaming Yuan
2021-07-22 15:15:53 +08:00
committed by GitHub
parent e64ee6592f
commit e6088366df
34 changed files with 961 additions and 200 deletions

View File

@@ -874,8 +874,15 @@ SparsePage SparsePage::GetTranspose(int num_columns) const {
tid);
}
});
if (this->data.Empty()) {
transpose.offset.Resize(num_columns + 1);
transpose.offset.Fill(0);
}
CHECK_EQ(transpose.offset.Size(), num_columns + 1);
return transpose;
}
void SparsePage::Push(const SparsePage &batch) {
auto& data_vec = data.HostVector();
auto& offset_vec = offset.HostVector();
@@ -1007,6 +1014,7 @@ void SparsePage::PushCSC(const SparsePage &batch) {
auto const& other_offset = batch.offset.ConstHostVector();
if (other_data.empty()) {
self_offset = other_offset;
return;
}
if (!self_data.empty()) {

View File

@@ -19,11 +19,16 @@ void CopyInfoImpl(ArrayInterface column, HostDeviceVector<float>* out) {
cudaPointerAttributes attr;
dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
int32_t ptr_device = attr.device;
dh::safe_cuda(cudaSetDevice(ptr_device));
if (ptr_device >= 0) {
dh::safe_cuda(cudaSetDevice(ptr_device));
}
return ptr_device;
};
auto ptr_device = SetDeviceToPtr(column.data);
if (column.num_rows == 0) {
return;
}
out->SetDevice(ptr_device);
out->Resize(column.num_rows);
@@ -123,7 +128,12 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
<< "MetaInfo: " << c_key << ". " << ArrayInterfaceErrors::Dimension(1);
ArrayInterface array_interface(interface_str);
std::string key{c_key};
array_interface.AsColumnVector();
if (!((array_interface.num_cols == 1 && array_interface.num_rows == 0) ||
(array_interface.num_cols == 0 && array_interface.num_rows == 1))) {
// Not an empty column, transform it.
array_interface.AsColumnVector();
}
CHECK(!array_interface.valid.Data())
<< "Meta info " << key << " should be dense, found validity mask";
if (array_interface.num_rows == 0) {

View File

@@ -154,7 +154,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
size_t NumRows() const { return num_rows_; }
size_t NumColumns() const { return columns_.size(); }
size_t DeviceIdx() const { return device_idx_; }
int32_t DeviceIdx() const { return device_idx_; }
private:
CudfAdapterBatch batch_;
@@ -202,12 +202,12 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
size_t NumRows() const { return array_interface_.num_rows; }
size_t NumColumns() const { return array_interface_.num_cols; }
size_t DeviceIdx() const { return device_idx_; }
int32_t DeviceIdx() const { return device_idx_; }
private:
ArrayInterface array_interface_;
CupyAdapterBatch batch_;
int device_idx_;
int32_t device_idx_ {-1};
};
// Returns maximum row length

View File

@@ -10,6 +10,7 @@
namespace xgboost {
namespace data {
void EllpackPageSource::Fetch() {
dh::safe_cuda(cudaSetDevice(param_.gpu_id));
if (!this->ReadCache()) {
auto const &csr = source_->Page();
this->page_.reset(new EllpackPage{});

View File

@@ -14,6 +14,9 @@ void DMatrixProxy::FromCudaColumnar(std::string interface_str) {
device_ = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (device_ < 0) {
CHECK_EQ(this->Info().num_row_, 0);
}
}
void DMatrixProxy::FromCudaArray(std::string interface_str) {
@@ -22,6 +25,9 @@ void DMatrixProxy::FromCudaArray(std::string interface_str) {
device_ = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
this->Info().num_row_ = adapter->NumRows();
if (device_ < 0) {
CHECK_EQ(this->Info().num_row_, 0);
}
}
} // namespace data

View File

@@ -141,9 +141,8 @@ decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_
} else {
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
}
auto value = dmlc::get<std::shared_ptr<ArrayAdapter>>(
proxy->Adapter())->Value();
return fn(value);
return std::result_of_t<Fn(
decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
}
}
} // namespace data

View File

@@ -16,7 +16,10 @@ namespace data {
// be supported in future. Does not currently support inferring row/column size
template <typename AdapterT>
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
dh::safe_cuda(cudaSetDevice(adapter->DeviceIdx()));
auto device =
adapter->DeviceIdx() < 0 ? dh::CurrentDevice() : adapter->DeviceIdx();
CHECK_GE(device, 0);
dh::safe_cuda(cudaSetDevice(device));
CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize);
@@ -27,8 +30,8 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
// Enforce single batch
CHECK(!adapter->Next());
info_.num_nonzero_ = CopyToSparsePage(adapter->Value(), adapter->DeviceIdx(),
missing, sparse_page_.get());
info_.num_nonzero_ =
CopyToSparsePage(adapter->Value(), device, missing, sparse_page_.get());
info_.num_col_ = adapter->NumColumns();
info_.num_row_ = adapter->NumRows();
// Synchronise worker columns

View File

@@ -15,6 +15,29 @@ MetaInfo &SparsePageDMatrix::Info() { return info_; }
const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
namespace detail {
// Use device dispatch
// Sample count for a proxy that has to be dispatched on device.
//
// With XGBOOST_USE_CUDA defined only the declaration is emitted here and the
// definition is expected from another translation unit.  Without it, the
// fallback calls common::AssertGPUSupport() and then returns 0.
#if defined(XGBOOST_USE_CUDA)
size_t NSamplesDevice(DMatrixProxy *proxy);
#else
size_t NSamplesDevice(DMatrixProxy *proxy) {
  common::AssertGPUSupport();
  return 0;
}
#endif  // defined(XGBOOST_USE_CUDA)
// Feature count for a proxy that has to be dispatched on device.
//
// With XGBOOST_USE_CUDA defined only the declaration is emitted here and the
// definition is expected from another translation unit.  Without it, the
// fallback calls common::AssertGPUSupport() and then returns 0.
#if defined(XGBOOST_USE_CUDA)
size_t NFeaturesDevice(DMatrixProxy *proxy);
#else
size_t NFeaturesDevice(DMatrixProxy *proxy) {
  common::AssertGPUSupport();
  return 0;
}
#endif  // defined(XGBOOST_USE_CUDA)
} // namespace detail
SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy_handle,
DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing,
@@ -35,13 +58,24 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
size_t nnz = 0;
auto num_rows = [&]() {
return HostAdapterDispatch(
proxy, [](auto const &value) { return value.NumRows(); });
bool type_error {false};
size_t n_samples = HostAdapterDispatch(
proxy, [](auto const &value) { return value.NumRows(); }, &type_error);
if (type_error) {
n_samples = detail::NSamplesDevice(proxy);
}
return n_samples;
};
auto num_cols = [&]() {
return HostAdapterDispatch(
proxy, [](auto const &value) { return value.NumCols(); });
bool type_error {false};
size_t n_features = HostAdapterDispatch(
proxy, [](auto const &value) { return value.NumCols(); }, &type_error);
if (type_error) {
n_features = detail::NFeaturesDevice(proxy);
}
return n_features;
};
// the proxy is iterated together with the sparse page source so we can obtain all
// information in 1 pass.
for (auto const &page : this->GetRowBatchesImpl()) {

View File

@@ -7,8 +7,24 @@
namespace xgboost {
namespace data {
namespace detail {
// Returns NumRows() of whatever value Dispatch passes to the callback for
// this proxy.
size_t NSamplesDevice(DMatrixProxy *proxy) {
  auto count_rows = [](auto const &batch) { return batch.NumRows(); };
  return Dispatch(proxy, count_rows);
}
// Returns NumCols() of whatever value Dispatch passes to the callback for
// this proxy.
size_t NFeaturesDevice(DMatrixProxy *proxy) {
  auto count_cols = [](auto const &batch) { return batch.NumCols(); };
  return Dispatch(proxy, count_cols);
}
} // namespace detail
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
auto device = proxy->DeviceIdx();
if (device < 0) {
device = dh::CurrentDevice();
}
CHECK_GE(device, 0);
Dispatch(proxy, [&](auto const &value) {
CopyToSparsePage(value, device, missing, page);
});

View File

@@ -236,7 +236,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
iter_{iter}, proxy_{proxy} {
if (!cache_info_->written) {
iter_.Reset();
iter_.Next();
CHECK_EQ(iter_.Next(), 1) << "Must have at least 1 batch.";
}
this->Fetch();
}