Accept iterator in device dmatrix. (#5783)

* Remove Device DMatrix.
2020-07-07 21:44:48 +08:00
parent 048d969be4
commit a3ec964346
12 changed files with 495 additions and 343 deletions
--- a/demo/guide-python/data_iterator.py
+++ b/demo/guide-python/data_iterator.py
@@ -0,0 +1,109 @@
+'''A demo for defining data iterator.
+
+This demo defines a customized iterator for passing batches of data into
+`xgboost.DeviceQuantileDMatrix` and uses this `DeviceQuantileDMatrix` for
+training.  The feature is primarily designed to reduce the required GPU
+memory for training in a distributed environment.
+
+After going through the demo, one might ask why we don't use a more native
+Python iterator.  That's because XGBoost requires a `reset` function, while
+using `itertools.tee` might incur significant memory usage according to:
+
+  https://docs.python.org/3/library/itertools.html#itertools.tee.
+
+'''
+
+import xgboost
+import cupy
+import numpy
+
+# Shape of the synthetic data set used by the demo iterator below.
+COLS = 64                        # number of feature columns in every batch
+ROWS_PER_BATCH = 1000            # data is split by rows
+BATCHES = 32                     # number of batches held by the iterator
+
+
+class IterForDMatrixDemo(xgboost.core.DataIter):
+    '''A data iterator for XGBoost DMatrix.
+
+    `reset` and `next` are required for any data iterator, other functions here
+    are utilities for demonstration's purpose.
+
+    '''
+    def __init__(self):
+        '''Generate some random data for demonstration.
+
+        Actual data can be anything that is currently supported by XGBoost.
+        '''
+        self.rows = ROWS_PER_BATCH
+        self.cols = COLS
+        rng = cupy.random.RandomState(1994)
+        # List repetition stores the *same* array object BATCHES times, so all
+        # batches are identical.  That is fine here because the arrays are
+        # never modified and it keeps the GPU memory footprint small.
+        self._data = [rng.randn(self.rows, self.cols)] * BATCHES
+        self._labels = [rng.randn(self.rows)] * BATCHES
+        # Instance weights must not be negative, hence a uniform draw from
+        # [0, 1) instead of a normal distribution.
+        self._weights = [rng.uniform(size=self.rows)] * BATCHES
+
+        self.it = 0             # index of the batch handed out by `next`
+        super().__init__()
+
+    def as_array(self):
+        '''Return all batches of data concatenated into a single array.'''
+        return cupy.concatenate(self._data)
+
+    def as_array_labels(self):
+        '''Return the labels of all batches concatenated into one array.'''
+        return cupy.concatenate(self._labels)
+
+    def as_array_weights(self):
+        '''Return the weights of all batches concatenated into one array.'''
+        return cupy.concatenate(self._weights)
+
+    def data(self):
+        '''Utility function for obtaining current batch of data.'''
+        return self._data[self.it]
+
+    def labels(self):
+        '''Utility function for obtaining current batch of label.'''
+        return self._labels[self.it]
+
+    def weights(self):
+        '''Utility function for obtaining current batch of weight.'''
+        return self._weights[self.it]
+
+    def reset(self):
+        '''Reset the iterator so that iteration restarts at the first batch.'''
+        self.it = 0
+
+    def next(self, input_data):
+        '''Yield next batch of data.
+
+        Parameters
+        ----------
+        input_data : callable
+            Callback that receives the current batch through the keyword
+            arguments `data`, `label` and `weight`.
+
+        Returns
+        -------
+        int
+            0 when there's no more batch, otherwise 1.
+        '''
+        if self.it == len(self._data):
+            # Return 0 when there's no more batch.
+            return 0
+        input_data(data=self.data(), label=self.labels(),
+                   weight=self.weights())
+        self.it += 1
+        return 1
+
+
+def main():
+    '''Train on the iterator and on the concatenated data, then compare.
+
+    One model is trained from a `DeviceQuantileDMatrix` built from the batch
+    iterator, another from a regular `DMatrix` built from the concatenated
+    arrays; their predictions on their own training matrix are compared.
+    '''
+    n_rounds = 100
+    batch_iter = IterForDMatrixDemo()
+
+    # Use iterator, must be `DeviceQuantileDMatrix`
+    dm_batched = xgboost.DeviceQuantileDMatrix(batch_iter)
+
+    # Use regular DMatrix.
+    full_data = batch_iter.as_array()
+    full_labels = batch_iter.as_array_labels()
+    full_weights = batch_iter.as_array_weights()
+    dm_full = xgboost.DMatrix(full_data, full_labels, weight=full_weights)
+
+    # Both matrices must describe the same data set.
+    assert dm_batched.num_col() == dm_full.num_col()
+    assert dm_batched.num_row() == dm_full.num_row()
+
+    booster_batched = xgboost.train({'tree_method': 'gpu_hist'}, dm_batched,
+                                    num_boost_round=n_rounds)
+    pred_batched = booster_batched.predict(dm_batched)
+
+    booster_full = xgboost.train({'tree_method': 'gpu_hist'}, dm_full,
+                                 num_boost_round=n_rounds)
+    pred_full = booster_full.predict(dm_full)
+
+    # NOTE(review): rtol=1e6 is a relative tolerance of one million, which
+    # makes this check effectively vacuous -- confirm the intended tolerance.
+    numpy.testing.assert_allclose(pred_batched, pred_full, rtol=1e6)
+
+
+# Run the demo only when this file is executed as a script, not on import.
+if __name__ == '__main__':
+    main()