Rewrite sparse dmatrix using callbacks. (#7092)

- Reduce dependency on dmlc parsers and provide an interface for users to load data by themselves. - Remove use of threaded iterator and IO queue. - Remove `page_size`. - Make sure the number of pages in memory is bounded. - Make sure the cache can not be violated. - Provide an interface for internal algorithms to process data asynchronously.
2021-07-16 12:33:31 +08:00
parent 2f524e9f41
commit bd1f3a38f0
51 changed files with 1445 additions and 1391 deletions
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -55,7 +55,9 @@ int64_t GetFileSize(const std::string& filename);

 void CreateSimpleTestData(const std::string& filename);

-void CreateBigTestData(const std::string& filename, size_t n_entries);
+// Create a libsvm format file with 3 entries per-row. `zero_based` specifies whether it's
+// 0-based indexing.
+void CreateBigTestData(const std::string& filename, size_t n_entries, bool zero_based = true);

 void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
                      std::vector<xgboost::bst_float> preds,
@@ -300,8 +302,7 @@ GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) {
 std::shared_ptr<DMatrix> GetDMatrixFromData(const std::vector<float> &x,
                                            int num_rows, int num_columns);

-std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
-    size_t n_entries, size_t page_size, std::string tmp_file);
+std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, std::string prefix = "cache");

 /**
 * \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
@@ -356,7 +357,8 @@ inline HostDeviceVector<GradientPair> GenerateRandomGradients(const size_t n_row

 typedef void *DMatrixHandle;  // NOLINT(*);

-class CudaArrayIterForTest {
+class ArrayIterForTest {
+ protected:
  HostDeviceVector<float> data_;
  size_t iter_ {0};
  DMatrixHandle proxy_;
@@ -373,20 +375,32 @@ class CudaArrayIterForTest {
  size_t static constexpr kBatches { 100 };
  size_t static constexpr kCols { 13 };

-  explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
-                                size_t cols = kCols, size_t batches = kBatches);
-  ~CudaArrayIterForTest();
-
  std::string AsArray() const {
    return interface_;
  }

-  int Next();
-  void Reset() {
+  virtual int Next();
+  virtual void Reset() {
    iter_ = 0;
  }
  size_t Iter() const { return iter_; }
  auto Proxy() -> decltype(proxy_) { return proxy_; }
+
+  explicit ArrayIterForTest(float sparsity, size_t rows = kRows,
+                            size_t cols = kCols, size_t batches = kBatches);
+  virtual ~ArrayIterForTest();
+};
+
+class CudaArrayIterForTest : public ArrayIterForTest {
+ public:
+  size_t static constexpr kRows{1000};
+  size_t static constexpr kBatches{100};
+  size_t static constexpr kCols{13};
+
+  explicit CudaArrayIterForTest(float sparsity, size_t rows = kRows,
+                                size_t cols = kCols, size_t batches = kBatches);
+  int Next() override;
+  ~CudaArrayIterForTest() override = default;
 };

 void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
@@ -396,11 +410,11 @@ void DMatrixToCSR(DMatrix *dmat, std::vector<float> *p_data,
 typedef void *DataIterHandle;  // NOLINT(*)

 inline void Reset(DataIterHandle self) {
-  static_cast<CudaArrayIterForTest*>(self)->Reset();
+  static_cast<ArrayIterForTest*>(self)->Reset();
 }

 inline int Next(DataIterHandle self) {
-  return static_cast<CudaArrayIterForTest*>(self)->Next();
+  return static_cast<ArrayIterForTest*>(self)->Next();
 }

 class RMMAllocator;