- training with external memory - part 2 of 2 (#4526)

* - training with external memory - part 2 of 2 - when external memory support is enabled, the histogram indices are built incrementally for every sparse page - the entire set of input data is divided across multiple GPUs, and the relative row positions within each device are tracked when building the compressed histogram buffer - this was tested using a mortgage dataset containing ~670M rows before 4x T4s could be saturated
2019-06-11 14:52:56 -07:00
parent 4591039eba
commit a2042b685a
4 changed files with 292 additions and 61 deletions
--- a/tests/cpp/helpers.h
+++ b/tests/cpp/helpers.h
@@ -165,6 +165,27 @@ std::shared_ptr<xgboost::DMatrix> *CreateDMatrix(int rows, int columns,

 std::unique_ptr<DMatrix> CreateSparsePageDMatrix(size_t n_entries, size_t page_size);

+/**
+ * \fn std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
+ *                                                            size_t page_size, bool deterministic);
+ *
+ * \brief Creates dmatrix with some records, each record containing random number of
+ *        features in [1, n_cols]
+ *
+ * \param n_rows      Number of records to create.
+ * \param n_cols      Max number of features within each record.
+ * \param page_size   Sparse page size for the pages within the dmatrix. If page size is 0
+ *                    then the entire dmatrix is resident in memory; else, multiple sparse pages
+ *                    of page size are created and backed to disk, which would have to be
+ *                    streamed in at point of use.
+ * \param deterministic The content inside the dmatrix is constant for this configuration, if true;
+ *                      else, the content changes every time this method is invoked
+ *
+ * \return The new dmatrix.
+ */
+std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(size_t n_rows, size_t n_cols,
+                                                       size_t page_size, bool deterministic);
+
 gbm::GBTreeModel CreateTestModel();

 inline LearnerTrainParam CreateEmptyGenericParam(int gpu_id, int n_gpus) {