Export Python Interface for external memory. (#7070)
* Add Python iterator interface.
* Add tests.
* Add demo.
* Add documentation.
* Handle empty dataset.
This commit is contained in:
@@ -874,8 +874,15 @@ SparsePage SparsePage::GetTranspose(int num_columns) const {
|
||||
tid);
|
||||
}
|
||||
});
|
||||
|
||||
if (this->data.Empty()) {
|
||||
transpose.offset.Resize(num_columns + 1);
|
||||
transpose.offset.Fill(0);
|
||||
}
|
||||
CHECK_EQ(transpose.offset.Size(), num_columns + 1);
|
||||
return transpose;
|
||||
}
|
||||
|
||||
void SparsePage::Push(const SparsePage &batch) {
|
||||
auto& data_vec = data.HostVector();
|
||||
auto& offset_vec = offset.HostVector();
|
||||
@@ -1007,6 +1014,7 @@ void SparsePage::PushCSC(const SparsePage &batch) {
|
||||
auto const& other_offset = batch.offset.ConstHostVector();
|
||||
|
||||
if (other_data.empty()) {
|
||||
self_offset = other_offset;
|
||||
return;
|
||||
}
|
||||
if (!self_data.empty()) {
|
||||
|
||||
@@ -19,11 +19,16 @@ void CopyInfoImpl(ArrayInterface column, HostDeviceVector<float>* out) {
|
||||
cudaPointerAttributes attr;
|
||||
dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr));
|
||||
int32_t ptr_device = attr.device;
|
||||
dh::safe_cuda(cudaSetDevice(ptr_device));
|
||||
if (ptr_device >= 0) {
|
||||
dh::safe_cuda(cudaSetDevice(ptr_device));
|
||||
}
|
||||
return ptr_device;
|
||||
};
|
||||
auto ptr_device = SetDeviceToPtr(column.data);
|
||||
|
||||
if (column.num_rows == 0) {
|
||||
return;
|
||||
}
|
||||
out->SetDevice(ptr_device);
|
||||
out->Resize(column.num_rows);
|
||||
|
||||
@@ -123,7 +128,12 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
|
||||
<< "MetaInfo: " << c_key << ". " << ArrayInterfaceErrors::Dimension(1);
|
||||
ArrayInterface array_interface(interface_str);
|
||||
std::string key{c_key};
|
||||
array_interface.AsColumnVector();
|
||||
if (!((array_interface.num_cols == 1 && array_interface.num_rows == 0) ||
|
||||
(array_interface.num_cols == 0 && array_interface.num_rows == 1))) {
|
||||
// Not an empty column, transform it.
|
||||
array_interface.AsColumnVector();
|
||||
}
|
||||
|
||||
CHECK(!array_interface.valid.Data())
|
||||
<< "Meta info " << key << " should be dense, found validity mask";
|
||||
if (array_interface.num_rows == 0) {
|
||||
|
||||
@@ -154,7 +154,7 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
|
||||
|
||||
size_t NumRows() const { return num_rows_; }
|
||||
size_t NumColumns() const { return columns_.size(); }
|
||||
size_t DeviceIdx() const { return device_idx_; }
|
||||
int32_t DeviceIdx() const { return device_idx_; }
|
||||
|
||||
private:
|
||||
CudfAdapterBatch batch_;
|
||||
@@ -202,12 +202,12 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
|
||||
|
||||
size_t NumRows() const { return array_interface_.num_rows; }
|
||||
size_t NumColumns() const { return array_interface_.num_cols; }
|
||||
size_t DeviceIdx() const { return device_idx_; }
|
||||
int32_t DeviceIdx() const { return device_idx_; }
|
||||
|
||||
private:
|
||||
ArrayInterface array_interface_;
|
||||
CupyAdapterBatch batch_;
|
||||
int device_idx_;
|
||||
int32_t device_idx_ {-1};
|
||||
};
|
||||
|
||||
// Returns maximum row length
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
void EllpackPageSource::Fetch() {
|
||||
dh::safe_cuda(cudaSetDevice(param_.gpu_id));
|
||||
if (!this->ReadCache()) {
|
||||
auto const &csr = source_->Page();
|
||||
this->page_.reset(new EllpackPage{});
|
||||
|
||||
@@ -14,6 +14,9 @@ void DMatrixProxy::FromCudaColumnar(std::string interface_str) {
|
||||
device_ = adapter->DeviceIdx();
|
||||
this->Info().num_col_ = adapter->NumColumns();
|
||||
this->Info().num_row_ = adapter->NumRows();
|
||||
if (device_ < 0) {
|
||||
CHECK_EQ(this->Info().num_row_, 0);
|
||||
}
|
||||
}
|
||||
|
||||
void DMatrixProxy::FromCudaArray(std::string interface_str) {
|
||||
@@ -22,6 +25,9 @@ void DMatrixProxy::FromCudaArray(std::string interface_str) {
|
||||
device_ = adapter->DeviceIdx();
|
||||
this->Info().num_col_ = adapter->NumColumns();
|
||||
this->Info().num_row_ = adapter->NumRows();
|
||||
if (device_ < 0) {
|
||||
CHECK_EQ(this->Info().num_row_, 0);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace data
|
||||
|
||||
@@ -141,9 +141,8 @@ decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_
|
||||
} else {
|
||||
LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
|
||||
}
|
||||
auto value = dmlc::get<std::shared_ptr<ArrayAdapter>>(
|
||||
proxy->Adapter())->Value();
|
||||
return fn(value);
|
||||
return std::result_of_t<Fn(
|
||||
decltype(std::declval<std::shared_ptr<ArrayAdapter>>()->Value()))>();
|
||||
}
|
||||
}
|
||||
} // namespace data
|
||||
|
||||
@@ -16,7 +16,10 @@ namespace data {
|
||||
// be supported in future. Does not currently support inferring row/column size
|
||||
template <typename AdapterT>
|
||||
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
|
||||
dh::safe_cuda(cudaSetDevice(adapter->DeviceIdx()));
|
||||
auto device =
|
||||
adapter->DeviceIdx() < 0 ? dh::CurrentDevice() : adapter->DeviceIdx();
|
||||
CHECK_GE(device, 0);
|
||||
dh::safe_cuda(cudaSetDevice(device));
|
||||
|
||||
CHECK(adapter->NumRows() != kAdapterUnknownSize);
|
||||
CHECK(adapter->NumColumns() != kAdapterUnknownSize);
|
||||
@@ -27,8 +30,8 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
|
||||
// Enforce single batch
|
||||
CHECK(!adapter->Next());
|
||||
|
||||
info_.num_nonzero_ = CopyToSparsePage(adapter->Value(), adapter->DeviceIdx(),
|
||||
missing, sparse_page_.get());
|
||||
info_.num_nonzero_ =
|
||||
CopyToSparsePage(adapter->Value(), device, missing, sparse_page_.get());
|
||||
info_.num_col_ = adapter->NumColumns();
|
||||
info_.num_row_ = adapter->NumRows();
|
||||
// Synchronise worker columns
|
||||
|
||||
@@ -15,6 +15,29 @@ MetaInfo &SparsePageDMatrix::Info() { return info_; }
|
||||
|
||||
const MetaInfo &SparsePageDMatrix::Info() const { return info_; }
|
||||
|
||||
namespace detail {
|
||||
// Use device dispatch
|
||||
size_t NSamplesDevice(DMatrixProxy *proxy)
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
; // NOLINT
|
||||
#else
|
||||
{
|
||||
common::AssertGPUSupport();
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
size_t NFeaturesDevice(DMatrixProxy *proxy)
|
||||
#if defined(XGBOOST_USE_CUDA)
|
||||
; // NOLINT
|
||||
#else
|
||||
{
|
||||
common::AssertGPUSupport();
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
} // namespace detail
|
||||
|
||||
|
||||
SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy_handle,
|
||||
DataIterResetCallback *reset,
|
||||
XGDMatrixCallbackNext *next, float missing,
|
||||
@@ -35,13 +58,24 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
|
||||
size_t nnz = 0;
|
||||
|
||||
auto num_rows = [&]() {
|
||||
return HostAdapterDispatch(
|
||||
proxy, [](auto const &value) { return value.NumRows(); });
|
||||
bool type_error {false};
|
||||
size_t n_samples = HostAdapterDispatch(
|
||||
proxy, [](auto const &value) { return value.NumRows(); }, &type_error);
|
||||
if (type_error) {
|
||||
n_samples = detail::NSamplesDevice(proxy);
|
||||
}
|
||||
return n_samples;
|
||||
};
|
||||
auto num_cols = [&]() {
|
||||
return HostAdapterDispatch(
|
||||
proxy, [](auto const &value) { return value.NumCols(); });
|
||||
bool type_error {false};
|
||||
size_t n_features = HostAdapterDispatch(
|
||||
proxy, [](auto const &value) { return value.NumCols(); }, &type_error);
|
||||
if (type_error) {
|
||||
n_features = detail::NFeaturesDevice(proxy);
|
||||
}
|
||||
return n_features;
|
||||
};
|
||||
|
||||
// the proxy is iterated together with the sparse page source so we can obtain all
|
||||
// information in 1 pass.
|
||||
for (auto const &page : this->GetRowBatchesImpl()) {
|
||||
|
||||
@@ -7,8 +7,24 @@
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
|
||||
namespace detail {
|
||||
size_t NSamplesDevice(DMatrixProxy *proxy) {
|
||||
return Dispatch(proxy, [](auto const &value) { return value.NumRows(); });
|
||||
}
|
||||
|
||||
size_t NFeaturesDevice(DMatrixProxy *proxy) {
|
||||
return Dispatch(proxy, [](auto const &value) { return value.NumCols(); });
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) {
|
||||
auto device = proxy->DeviceIdx();
|
||||
if (device < 0) {
|
||||
device = dh::CurrentDevice();
|
||||
}
|
||||
CHECK_GE(device, 0);
|
||||
|
||||
Dispatch(proxy, [&](auto const &value) {
|
||||
CopyToSparsePage(value, device, missing, page);
|
||||
});
|
||||
|
||||
@@ -236,7 +236,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
|
||||
iter_{iter}, proxy_{proxy} {
|
||||
if (!cache_info_->written) {
|
||||
iter_.Reset();
|
||||
iter_.Next();
|
||||
CHECK_EQ(iter_.Next(), 1) << "Must have at least 1 batch.";
|
||||
}
|
||||
this->Fetch();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user