diff --git a/include/xgboost/data.h b/include/xgboost/data.h index a0674f96b..57babfafe 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -441,6 +441,13 @@ class DMatrix { DMatrix() = default; /*! \brief meta information of the dataset */ virtual MetaInfo& Info() = 0; + virtual void SetInfo(const char *key, const void *dptr, DataType dtype, + size_t num) { + this->Info().SetInfo(key, dptr, dtype, num); + } + virtual void SetInfo(const char* key, std::string const& interface_str) { + this->Info().SetInfo(key, interface_str); + } /*! \brief meta information of the dataset */ virtual const MetaInfo& Info() const = 0; /** diff --git a/src/data/proxy_dmatrix.cu b/src/data/proxy_dmatrix.cu new file mode 100644 index 000000000..088351021 --- /dev/null +++ b/src/data/proxy_dmatrix.cu @@ -0,0 +1,24 @@ +/*! + * Copyright 2020 XGBoost contributors + */ +#include "proxy_dmatrix.h" +#include "device_adapter.cuh" + +namespace xgboost { +namespace data { + +void DMatrixProxy::FromCudaColumnar(std::string interface_str) { + std::shared_ptr adapter {new data::CudfAdapter(interface_str)}; + auto const& value = adapter->Value(); + this->batch_ = adapter; + device_ = adapter->DeviceIdx(); +} + +void DMatrixProxy::FromCudaArray(std::string interface_str) { + std::shared_ptr adapter(new CupyAdapter(interface_str)); + this->batch_ = adapter; + device_ = adapter->DeviceIdx(); +} + +} // namespace data +} // namespace xgboost diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h new file mode 100644 index 000000000..1eb00e130 --- /dev/null +++ b/src/data/proxy_dmatrix.h @@ -0,0 +1,104 @@ +/*! + * Copyright 2020 XGBoost contributors + */ +#ifndef XGBOOST_DATA_PROXY_DMATRIX_H_ +#define XGBOOST_DATA_PROXY_DMATRIX_H_ + +#include + +#include +#include +#include + +#include "xgboost/data.h" +#include "xgboost/generic_parameters.h" +#include "xgboost/c_api.h" +#include "adapter.h" + +namespace xgboost { +namespace data { +/* + * \brief A proxy to external iterator. + */ +template +class DataIterProxy { + DataIterHandle iter_; + ResetFn* reset_; + NextFn* next_; + + public: + DataIterProxy(DataIterHandle iter, ResetFn* reset, NextFn* next) : + iter_{iter}, + reset_{reset}, next_{next} {} + + bool Next() { + return next_(iter_); + } + void Reset() { + reset_(iter_); + } +}; + +/* + * \brief A proxy of DMatrix used by external iterator. + */ +class DMatrixProxy : public DMatrix { + MetaInfo info_; + dmlc::any batch_; + int32_t device_ { xgboost::GenericParameter::kCpuId }; + +#if defined(XGBOOST_USE_CUDA) + void FromCudaColumnar(std::string interface_str); + void FromCudaArray(std::string interface_str); +#endif // defined(XGBOOST_USE_CUDA) + + public: + int DeviceIdx() const { return device_; } + + void SetData(char const* c_interface) { + common::AssertGPUSupport(); +#if defined(XGBOOST_USE_CUDA) + std::string interface_str = c_interface; + Json json_array_interface = + Json::Load({interface_str.c_str(), interface_str.size()}); + if (IsA(json_array_interface)) { + this->FromCudaColumnar(interface_str); + } else { + this->FromCudaArray(interface_str); + } +#endif // defined(XGBOOST_USE_CUDA) + } + + MetaInfo& Info() override { return info_; } + MetaInfo const& Info() const override { return info_; } + bool SingleColBlock() const override { return true; } + bool EllpackExists() const override { return true; } + bool SparsePageExists() const override { return false; } + DMatrix *Slice(common::Span ridxs) override { + LOG(FATAL) << "Slicing DMatrix is not supported for Proxy DMatrix."; + return nullptr; + } + BatchSet GetRowBatches() override { + LOG(FATAL) << "Not implemented."; + return BatchSet(BatchIterator(nullptr)); + } + BatchSet GetColumnBatches() override { + LOG(FATAL) << "Not implemented."; + return BatchSet(BatchIterator(nullptr)); + } + BatchSet GetSortedColumnBatches() override { + LOG(FATAL) << "Not implemented."; + return BatchSet(BatchIterator(nullptr)); + } + BatchSet GetEllpackBatches(const BatchParam& param) override { + LOG(FATAL) << "Not implemented."; + return BatchSet(BatchIterator(nullptr)); + } + + dmlc::any Adapter() const { + return batch_; + } +}; +} // namespace data +} // namespace xgboost +#endif // XGBOOST_DATA_PROXY_DMATRIX_H_ diff --git a/tests/cpp/data/test_proxy_dmatrix.cu b/tests/cpp/data/test_proxy_dmatrix.cu new file mode 100644 index 000000000..5460995d9 --- /dev/null +++ b/tests/cpp/data/test_proxy_dmatrix.cu @@ -0,0 +1,46 @@ +#include +#include +#include +#include "../helpers.h" +#include "../../../src/data/device_adapter.cuh" +#include "../../../src/data/proxy_dmatrix.h" + +namespace xgboost { +namespace data { +TEST(ProxyDMatrix, Basic) { + constexpr size_t kRows{100}, kCols{100}; + HostDeviceVector storage; + auto data = RandomDataGenerator(kRows, kCols, 0.5) + .Device(0) + .GenerateArrayInterface(&storage); + std::vector> label_storage(1); + auto labels = RandomDataGenerator(kRows, 1, 0) + .Device(0) + .GenerateColumnarArrayInterface(&label_storage); + + DMatrixProxy proxy; + proxy.SetData(data.c_str()); + proxy.SetInfo("label", labels.c_str()); + + ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr)); + ASSERT_EQ(proxy.Info().labels_.Size(), kRows); + ASSERT_EQ(dmlc::get>(proxy.Adapter())->NumRows(), + kRows); + ASSERT_EQ( + dmlc::get>(proxy.Adapter())->NumColumns(), + kCols); + + std::vector> columnar_storage(kCols); + data = RandomDataGenerator(kRows, kCols, 0) + .Device(0) + .GenerateColumnarArrayInterface(&columnar_storage); + proxy.SetData(data.c_str()); + ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr)); + ASSERT_EQ(dmlc::get>(proxy.Adapter())->NumRows(), + kRows); + ASSERT_EQ( + dmlc::get>(proxy.Adapter())->NumColumns(), + kCols); +} +} // namespace data +} // namespace xgboost