Add native support for Dask (#4473)

* Add native support for Dask * Add multi-GPU demo * Add sklearn example
2019-05-27 13:29:28 +12:00
parent 55e645c5f5
commit 09b90d9329
13 changed files with 407 additions and 16 deletions
--- a/demo/dask/README.md
+++ b/demo/dask/README.md
@@ -0,0 +1,20 @@
+# Dask Integration
+
+[Dask](https://dask.org/) is a parallel computing library built on Python. Dask allows easy management of distributed workers and excels at handling large distributed data science workflows.
+
+The simple demo shows how to train and make predictions for an xgboost model in a distributed dask environment. We make use of first-class support in xgboost for launching dask workers. Workers launched in this manner are automatically connected via xgboost's underlying communication framework, Rabit. The calls to `xgb.train()` and `Booster.predict()` occur in parallel on each worker and are synchronized.
+
+The GPU demo shows how to configure and use GPUs on the local machine for training on a large dataset.
+
+## Requirements
+Dask is trivial to install using either pip or conda. [See here for official install documentation](https://docs.dask.org/en/latest/install.html).
+
+The GPU demo requires [GPUtil](https://github.com/anderskm/gputil) for detecting system GPUs.
+
+Install it via `pip install gputil`.
+
+## Running the scripts
+```bash
+python dask_simple_demo.py
+python dask_gpu_demo.py
+```
--- a/demo/dask/dask_gpu_demo.py
+++ b/demo/dask/dask_gpu_demo.py
@@ -0,0 +1,42 @@
+from dask.distributed import Client, LocalCluster
+import dask.dataframe as dd
+import dask.array as da
+import numpy as np
+import xgboost as xgb
+import GPUtil
+import time
+
+
+# Define the function to be executed on each worker
+def train(X, y, available_devices):
+    # Wrap the parts of X and y held by this worker in a DMatrix
+    dtrain = xgb.dask.create_worker_dmatrix(X, y)
+    rank = xgb.rabit.get_rank()
+    # Use the GPU algorithm on the device whose list position matches this worker's rank
+    params = {"tree_method": "gpu_hist", "gpu_id": available_devices[rank]}
+    print("Worker {} starting training on {} rows".format(rank, dtrain.num_row()))
+    start = time.time()
+    xgb.train(params, dtrain, num_boost_round=500)
+    print("Worker {} finished training in {:0.2f}s".format(rank, time.time() - start))
+
+
+def main():
+    max_devices = 16
+    # Check which devices we have locally; one worker will be launched per device
+    available_devices = GPUtil.getAvailable(limit=max_devices)
+    if not available_devices:
+        raise RuntimeError("No available GPU was detected; this demo needs at least one")
+    cluster = LocalCluster(n_workers=len(available_devices), threads_per_worker=4)
+    client = Client(cluster)
+
+    # Set up a relatively large regression problem (m rows, n columns)
+    m, n = 10000000, 100
+    partition_size = 100000
+    X = da.random.random((m, n), partition_size)
+    y = da.random.random(m, partition_size)
+
+    xgb.dask.run(client, train, X, y, available_devices)
+
+
+if __name__ == '__main__':
+    main()
--- a/demo/dask/dask_simple_demo.py
+++ b/demo/dask/dask_simple_demo.py
@@ -0,0 +1,68 @@
+from dask.distributed import Client, LocalCluster
+import dask.dataframe as dd
+import dask.array as da
+import numpy as np
+import xgboost as xgb
+
+
+# Define the function to be executed on each worker
+def train(X, y):
+    rank = xgb.rabit.get_rank()
+    print("Start training with worker #{}".format(rank))
+    # X and y are dask objects spread across the cluster, so each worker first has to obtain
+    # the data it holds locally and convert it to a DMatrix for training.
+    # xgb.dask.create_worker_dmatrix mirrors the API of the standard DMatrix constructor
+    # (xgb.DMatrix()), except that it 'unpacks' dask distributed objects into the data local to
+    # this worker
+    local_dmatrix = xgb.dask.create_worker_dmatrix(X, y)
+
+    # Train on the local data. Workers communicate and synchronise with each other during
+    # training, so the resulting model is expected to be identical on each worker.
+    booster = xgb.train({}, local_dmatrix)
+    # Make predictions on local data
+    pred = booster.predict(local_dmatrix)
+    print("Finished training with worker #{}".format(rank))
+    return booster.get_dump()  # text representation of the model
+
+
+def train_with_sklearn(X, y):
+    print("Training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))
+    # Fetch this worker's local pieces of X and y, then fit a regressor through the sklearn API
+    features, targets = xgb.dask.get_local_data(X), xgb.dask.get_local_data(y)
+    regressor = xgb.XGBRegressor(n_estimators=10)
+    regressor.fit(features, targets)
+    print("Finished training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))
+    return regressor.predict(features)
+
+
+def main():
+    # Launch a very simple local cluster using two distributed workers with two CPU threads each
+    cluster = LocalCluster(n_workers=2, threads_per_worker=2)
+    client = Client(cluster)
+
+    # Generate some small test data as dask arrays
+    # These arrays are internally split into partitions of 20 rows each and then distributed
+    #  among workers, so we will have 5 partitions distributed among 2 workers
+    # Note that the partition size MUST be consistent across different dask dataframes/arrays
+    n = 10
+    m = 100
+    partition_size = 20
+    X = da.random.random((m, n), partition_size)
+    y = da.random.random(m, partition_size)
+
+    # xgb.dask.run launches an arbitrary function and its arguments on the cluster
+    # Here train(X, y) will be called on each worker
+    # This function blocks until all work is complete
+    models = xgb.dask.run(client, train, X, y)
+
+    # models contains a dictionary mapping workers to results
+    # We expect that the models are the same over all workers
+    first_model = next(iter(models.values()))
+    assert all(model == first_model for model in models.values())
+
+    # We can also train using the sklearn API; results maps each worker to its local predictions
+    results = xgb.dask.run(client, train_with_sklearn, X, y)
+
+
+if __name__ == '__main__':
+    main()