Rewrite Dask interface. (#4819)

2019-09-25 01:30:14 -04:00
parent 562bb0ae31
commit b8433c455a
17 changed files with 1002 additions and 361 deletions
--- a/demo/dask/README.md
+++ b/demo/dask/README.md
@@ -1,20 +0,0 @@
-# Dask Integration
-
-[Dask](https://dask.org/) is a parallel computing library built on Python. Dask allows easy management of distributed workers and excels handling large distributed data science workflows.
-
-The simple demo shows how to train and make predictions for an xgboost model on a distributed dask environment. We make use of first-class support in xgboost for launching dask workers. Workers launched in this manner are automatically connected via xgboosts underlying communication framework, Rabit. The calls to `xgb.train()` and `xgb.predict()` occur in parallel on each worker and are synchronized.
-
-The GPU demo shows how to configure and use GPUs on the local machine for training on a large dataset.
-
-## Requirements
-Dask is trivial to install using either pip or conda. [See here for official install documentation](https://docs.dask.org/en/latest/install.html).
-
-The GPU demo requires [GPUtil](https://github.com/anderskm/gputil) for detecting system GPUs.
-
-Install via `pip install gputil` 
-
-## Running the scripts
-```bash
-python dask_simple_demo.py
-python dask_gpu_demo.py
-```
--- a/demo/dask/cpu_training.py
+++ b/demo/dask/cpu_training.py
@@ -0,0 +1,35 @@
+import xgboost as xgb
+from xgboost.dask import DaskDMatrix
+from dask.distributed import Client
+from dask.distributed import LocalCluster
+from dask import array as da
+
+
+def main(client):  # demo entry point: train and predict via xgb.dask using `client`
+    n = 100  # number of columns in X
+    m = 100000  # number of rows in X, and the length of y
+    partition_size = 1000  # 2nd positional arg to da.random.random (presumably the chunk size -- confirm)
+    X = da.random.random((m, n), partition_size)
+    y = da.random.random(m, partition_size)
+
+    dtrain = DaskDMatrix(client, X, y)
+
+    output = xgb.dask.train(client,
+                            {'verbosity': 2,
+                             'nthread': 1,
+                             'tree_method': 'hist'},
+                            dtrain,
+                            num_boost_round=4, evals=[(dtrain, 'train')])  # evaluated on the training data itself
+    bst = output['booster']  # `output` is indexed like a dict: 'booster' and 'history'
+    history = output['history']
+
+    prediction = xgb.dask.predict(client, bst, dtrain)  # predicts on the same data used for training
+    print('Evaluation history:', history)
+    return prediction
+
+
+if __name__ == '__main__':
+    # LocalCluster is only an example here; any other cluster can be used.
+    cluster = LocalCluster(n_workers=4, threads_per_worker=1)
+    client = Client(cluster)
+    main(client)  # the returned prediction is discarded
--- a/demo/dask/dask_gpu_demo.py
+++ b/demo/dask/dask_gpu_demo.py
@@ -1,42 +0,0 @@
-from dask.distributed import Client, LocalCluster
-import dask.dataframe as dd
-import dask.array as da
-import numpy as np
-import xgboost as xgb
-import GPUtil
-import time
-
-
-# Define the function to be executed on each worker
-def train(X, y, available_devices):
-    dtrain = xgb.dask.create_worker_dmatrix(X, y)
-    local_device = available_devices[xgb.rabit.get_rank()]
-    # Specify the GPU algorithm and device for this worker
-    params = {"tree_method": "gpu_hist", "gpu_id": local_device}
-    print("Worker {} starting training on {} rows".format(xgb.rabit.get_rank(), dtrain.num_row()))
-    start = time.time()
-    xgb.train(params, dtrain, num_boost_round=500)
-    end = time.time()
-    print("Worker {} finished training in {:0.2f}s".format(xgb.rabit.get_rank(), end - start))
-
-
-def main():
-    max_devices = 16
-    # Check which devices we have locally
-    available_devices = GPUtil.getAvailable(limit=max_devices)
-    # Use one worker per device
-    cluster = LocalCluster(n_workers=len(available_devices), threads_per_worker=4)
-    client = Client(cluster)
-
-    # Set up a relatively large regression problem
-    n = 100
-    m = 10000000
-    partition_size = 100000
-    X = da.random.random((m, n), partition_size)
-    y = da.random.random(m, partition_size)
-
-    xgb.dask.run(client, train, X, y, available_devices)
-
-
-if __name__ == '__main__':
-    main()
--- a/demo/dask/dask_simple_demo.py
+++ b/demo/dask/dask_simple_demo.py
@@ -1,68 +0,0 @@
-from dask.distributed import Client, LocalCluster
-import dask.dataframe as dd
-import dask.array as da
-import numpy as np
-import xgboost as xgb
-
-
-# Define the function to be executed on each worker
-def train(X, y):
-    print("Start training with worker #{}".format(xgb.rabit.get_rank()))
-    # X,y are dask objects distributed across the cluster.
-    # We must obtain the data local to this worker and convert it to DMatrix for training.
-    # xgb.dask.create_worker_dmatrix follows the API exactly of the standard DMatrix constructor
-    # (xgb.DMatrix()), except that it 'unpacks' dask distributed objects to obtain data local to
-    # this worker
-    dtrain = xgb.dask.create_worker_dmatrix(X, y)
-
-    # Train on the data. Each worker will communicate and synchronise during training. The output
-    #  model is expected to be identical on each worker.
-    bst = xgb.train({}, dtrain)
-    # Make predictions on local data
-    pred = bst.predict(dtrain)
-    print("Finished training with worker #{}".format(xgb.rabit.get_rank()))
-    # Get text representation of the model
-    return bst.get_dump()
-
-
-def train_with_sklearn(X, y):
-    print("Training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))
-    X_local = xgb.dask.get_local_data(X)
-    y_local = xgb.dask.get_local_data(y)
-    model = xgb.XGBRegressor(n_estimators=10)
-    model.fit(X_local, y_local)
-    print("Finished training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))
-    return model.predict(X_local)
-
-
-def main():
-    # Launch a very simple local cluster using two distributed workers with two CPU threads each
-    cluster = LocalCluster(n_workers=2, threads_per_worker=2)
-    client = Client(cluster)
-
-    # Generate some small test data as a dask array
-    # These data frames are internally split into partitions of 20 rows each and then distributed
-    #  among workers, so we will have 5 partitions distributed among 2 workers
-    # Note that the partition size MUST be consistent across different dask dataframes/arrays
-    n = 10
-    m = 100
-    partition_size = 20
-    X = da.random.random((m, n), partition_size)
-    y = da.random.random(m, partition_size)
-
-    # xgb.dask.run launches an arbitrary function and its arguments on the cluster
-    # Here train(X, y) will be called on each worker
-    # This function blocks until all work is complete
-    models = xgb.dask.run(client, train, X, y)
-
-    # models contains a dictionary mapping workers to results
-    # We expect that the models are the same over all workers
-    first_model = next(iter(models.values()))
-    assert all(model == first_model for worker, model in models.items())
-
-    # We can also train using the sklearn API
-    results = xgb.dask.run(client, train_with_sklearn, X, y)
-
-
-if __name__ == '__main__':
-    main()
--- a/demo/dask/gpu_training.py
+++ b/demo/dask/gpu_training.py
@@ -0,0 +1,41 @@
+from dask_cuda import LocalCUDACluster
+from dask.distributed import Client
+from dask import array as da
+import xgboost as xgb
+from xgboost.dask import DaskDMatrix
+
+
+def main(client):  # demo entry point: train and predict via xgb.dask using `client`
+    n = 100  # number of columns in X
+    m = 100000  # number of rows in X, and the length of y
+    partition_size = 1000  # 2nd positional arg to da.random.random (presumably the chunk size -- confirm)
+    X = da.random.random((m, n), partition_size)
+    y = da.random.random(m, partition_size)
+
+    # DaskDMatrix acts like a normal DMatrix; it works as a proxy for the
+    # local DMatrix objects scattered across the workers.
+    dtrain = DaskDMatrix(client, X, y)
+
+    # Use the train method from xgboost.dask instead of xgboost.  This
+    # distributed version of train returns a dictionary containing the
+    # resulting booster and the evaluation history obtained from the
+    # evaluation metrics.
+    output = xgb.dask.train(client,
+                            {'verbosity': 2,
+                             'nthread': 1,
+                             'tree_method': 'gpu_hist'},
+                            dtrain,
+                            num_boost_round=4, evals=[(dtrain, 'train')])  # evaluated on the training data itself
+    bst = output['booster']
+    history = output['history']
+
+    prediction = xgb.dask.predict(client, bst, dtrain)  # predicts on the same data used for training
+    print('Evaluation history:', history)
+    return prediction
+
+
+if __name__ == '__main__':
+    # LocalCUDACluster is only an example here; any other cluster can be used.
+    cluster = LocalCUDACluster(n_workers=4, threads_per_worker=1)
+    client = Client(cluster)
+    main(client)  # the returned prediction is discarded
--- a/demo/dask/sklearn_cpu_training.py
+++ b/demo/dask/sklearn_cpu_training.py
@@ -0,0 +1,30 @@
+'''Dask interface demo:
+
+Use scikit-learn regressor interface with CPU histogram tree method.'''
+from dask.distributed import Client
+from dask.distributed import LocalCluster
+from dask import array as da
+import xgboost
+
+if __name__ == '__main__':
+    cluster = LocalCluster(n_workers=2, silence_logs=False)  # or use any other clusters
+    client = Client(cluster)
+
+    n = 100  # number of columns in X
+    m = 10000  # number of rows in X, and the length of y
+    partition_size = 100  # 2nd positional arg to da.random.random (presumably the chunk size -- confirm)
+    X = da.random.random((m, n), partition_size)
+    y = da.random.random(m, partition_size)
+
+    regressor = xgboost.dask.DaskXGBRegressor(verbosity=2, n_estimators=2)
+    regressor.set_params(tree_method='hist')
+    regressor.client = client  # the client is attached as an attribute after construction
+
+    regressor.fit(X, y, eval_set=[(X, y)])  # the training data is also used as the evaluation set
+    prediction = regressor.predict(X)
+
+    bst = regressor.get_booster()  # fetched for illustration; not used again below
+    history = regressor.evals_result()
+
+    print('Evaluation history:', history)
+    assert isinstance(prediction, da.Array)  # check that predict returned a dask array
--- a/demo/dask/sklearn_gpu_training.py
+++ b/demo/dask/sklearn_gpu_training.py
@@ -0,0 +1,31 @@
+'''Dask interface demo:
+
+Use scikit-learn regressor interface with GPU histogram tree method.'''
+
+from dask.distributed import Client
+# It's recommended to use dask_cuda for GPU assignment
+from dask_cuda import LocalCUDACluster
+from dask import array as da
+import xgboost
+
+if __name__ == '__main__':
+    cluster = LocalCUDACluster()  # constructed with default arguments
+    client = Client(cluster)
+
+    n = 100  # number of columns in X
+    m = 1000000  # number of rows in X, and the length of y
+    partition_size = 10000  # 2nd positional arg to da.random.random (presumably the chunk size -- confirm)
+    X = da.random.random((m, n), partition_size)
+    y = da.random.random(m, partition_size)
+
+    regressor = xgboost.dask.DaskXGBRegressor(verbosity=2)
+    regressor.set_params(tree_method='gpu_hist')
+    regressor.client = client  # the client is attached as an attribute after construction
+
+    regressor.fit(X, y, eval_set=[(X, y)])  # the training data is also used as the evaluation set
+    prediction = regressor.predict(X)
+
+    bst = regressor.get_booster()  # fetched for illustration; not used again below
+    history = regressor.evals_result()
+
+    print('Evaluation history:', history)