Improve doc and demo for dask. (#4907)

* Add a readme with a link to the doc. * Add more comments to the demonstration code. * Work around https://github.com/dask/distributed/issues/3081.
2019-09-30 23:59:37 -04:00
parent d30e63a0a5
commit 7e24a8d245
5 changed files with 54 additions and 17 deletions
--- a/demo/dask/cpu_training.py
+++ b/demo/dask/cpu_training.py
@@ -6,16 +6,23 @@ from dask import array as da


 def main(client):
+    # generate some random data for demonstration
    n = 100
    m = 100000
    partition_size = 1000
    X = da.random.random((m, n), partition_size)
    y = da.random.random(m, partition_size)

+    # DaskDMatrix acts like a normal DMatrix and works as a proxy for local
+    # DMatrix objects scattered across workers.
    dtrain = DaskDMatrix(client, X, y)

+    # Use the `train` method from xgboost.dask instead of the one from
+    # xgboost.  This distributed version of train returns a dictionary
+    # containing the resulting booster and the evaluation history
+    # obtained from the evaluation metrics.
    output = xgb.dask.train(client,
-                            {'verbosity': 2,
+                            {'verbosity': 1,
                             'nthread': 1,
                             'tree_method': 'hist'},
                            dtrain,
@@ -23,13 +30,14 @@ def main(client):
    bst = output['booster']
    history = output['history']

+    # You can also pass `output` directly into `predict`.
    prediction = xgb.dask.predict(client, bst, dtrain)
    print('Evaluation history:', history)
    return prediction


 if __name__ == '__main__':
-    # or use any other clusters
-    cluster = LocalCluster(n_workers=4, threads_per_worker=1)
-    client = Client(cluster)
-    main(client)
+    # or use other clusters for scaling
+    with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
+        with Client(cluster) as client:
+            main(client)