Add native support for Dask (#4473)

* Add native support for Dask

* Add multi-GPU demo

* Add sklearn example
This commit is contained in:
Rory Mitchell
2019-05-27 13:29:28 +12:00
committed by GitHub
parent 55e645c5f5
commit 09b90d9329
13 changed files with 407 additions and 16 deletions

20
demo/dask/README.md Normal file
View File

@@ -0,0 +1,20 @@
# Dask Integration
[Dask](https://dask.org/) is a parallel computing library built on Python. Dask allows easy management of distributed workers and excels at handling large distributed data science workflows.
The simple demo shows how to train and make predictions for an xgboost model in a distributed dask environment. We make use of first-class support in xgboost for launching dask workers. Workers launched in this manner are automatically connected via xgboost's underlying communication framework, Rabit. The calls to `xgb.train()` and `bst.predict()` occur in parallel on each worker and are synchronized.
The GPU demo shows how to configure and use GPUs on the local machine for training on a large dataset.
## Requirements
Dask is trivial to install using either pip or conda. [See here for official install documentation](https://docs.dask.org/en/latest/install.html).
The GPU demo requires [GPUtil](https://github.com/anderskm/gputil) for detecting system GPUs.
Install via `pip install gputil`
## Running the scripts
```bash
python dask_simple_demo.py
python dask_gpu_demo.py
```

View File

@@ -0,0 +1,42 @@
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
import dask.array as da
import numpy as np
import xgboost as xgb
import GPUtil
import time
# Define the function to be executed on each worker
def train(X, y, available_devices):
    """Train a GPU booster on the data held by the calling worker.

    Intended to be launched on every worker through ``xgb.dask.run``.

    Parameters
    ----------
    X, y
        Dask collections holding the features and the labels.
    available_devices
        Sequence of GPU device ids, indexed by the worker's Rabit rank.
    """
    # Build a DMatrix from the portion of X and y local to this worker.
    dtrain = xgb.dask.create_worker_dmatrix(X, y)

    # The rank doubles as the index selecting this worker's GPU.
    rank = xgb.rabit.get_rank()
    params = {"tree_method": "gpu_hist"}
    params["gpu_id"] = available_devices[rank]

    print("Worker {} starting training on {} rows".format(rank, dtrain.num_row()))
    start = time.time()
    xgb.train(params, dtrain, num_boost_round=500)
    elapsed = time.time() - start
    print("Worker {} finished training in {:0.2f}s".format(rank, elapsed))
def main():
    """Run the multi-GPU demo on a local dask cluster.

    Starts one dask worker per available GPU (at most ``max_devices``),
    generates a large random regression problem as dask arrays and runs
    ``train`` on every worker.

    Raises
    ------
    RuntimeError
        If no available GPU is detected, since a cluster with zero workers
        cannot run the training function.
    """
    max_devices = 16
    # Check which devices we have locally
    available_devices = GPUtil.getAvailable(limit=max_devices)
    if not available_devices:
        raise RuntimeError(
            "No available GPUs were detected; this demo requires at least one."
        )

    # Use one worker per device. The context managers make sure the cluster
    # and the client are shut down even if training fails.
    with LocalCluster(n_workers=len(available_devices), threads_per_worker=4) as cluster:
        with Client(cluster) as client:
            # Set up a relatively large regression problem
            n = 100
            m = 10000000
            partition_size = 100000
            X = da.random.random((m, n), partition_size)
            y = da.random.random(m, partition_size)

            # Launch train(X, y, available_devices) on the cluster's workers.
            xgb.dask.run(client, train, X, y, available_devices)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,68 @@
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
import dask.array as da
import numpy as np
import xgboost as xgb
# Define the function to be executed on each worker
def train(X, y):
    """Train a booster on this worker's local data and return its text dump.

    Intended to be launched on every worker through ``xgb.dask.run``.

    Parameters
    ----------
    X, y
        Dask objects distributed across the cluster (features and labels).

    Returns
    -------
    The result of ``Booster.get_dump()``, a text representation of the model.
    """
    start_message = "Start training with worker #{}"
    print(start_message.format(xgb.rabit.get_rank()))

    # X and y span the whole cluster; create_worker_dmatrix mirrors the
    # regular xgb.DMatrix() constructor but 'unpacks' the dask objects so
    # that only the data local to this worker ends up in the DMatrix.
    local_dmatrix = xgb.dask.create_worker_dmatrix(X, y)

    # Workers communicate and synchronise while training, so the resulting
    # model is expected to be identical on each of them.
    booster = xgb.train({}, local_dmatrix)

    # Demonstrates prediction on the local data; the result is not used.
    local_predictions = booster.predict(local_dmatrix)

    finish_message = "Finished training with worker #{}"
    print(finish_message.format(xgb.rabit.get_rank()))

    # Hand back the text representation of the model.
    return booster.get_dump()
def train_with_sklearn(X, y):
    """Fit an ``XGBRegressor`` on this worker's local data via the sklearn API.

    Intended to be launched on every worker through ``xgb.dask.run``.

    Parameters
    ----------
    X, y
        Dask collections holding the features and the labels.

    Returns
    -------
    The fitted regressor's predictions for the local features.
    """
    print("Training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))

    # Fetch the local part of each dask collection, features first.
    features, targets = (xgb.dask.get_local_data(part) for part in (X, y))

    regressor = xgb.XGBRegressor(n_estimators=10)
    regressor.fit(features, targets)

    print("Finished training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))
    return regressor.predict(features)
def main():
    """Run the simple dask demo on a small local cluster.

    Trains with the native API on every worker, checks that all workers
    produced the same model dump, then trains again using the sklearn API.
    """
    # Launch a very simple local cluster using two distributed workers with
    # two CPU threads each. The context managers make sure the cluster and
    # the client are shut down even if one of the steps below fails.
    with LocalCluster(n_workers=2, threads_per_worker=2) as cluster:
        with Client(cluster) as client:
            # Generate some small test data as dask arrays.
            # They are internally split into partitions of 20 rows each and
            # then distributed among workers, so we will have 5 partitions
            # distributed among 2 workers.
            # Note that the partition size MUST be consistent across
            # different dask dataframes/arrays.
            n = 10
            m = 100
            partition_size = 20
            X = da.random.random((m, n), partition_size)
            y = da.random.random(m, partition_size)

            # xgb.dask.run launches an arbitrary function and its arguments
            # on the cluster. Here train(X, y) will be called on each worker.
            # This function blocks until all work is complete.
            models = xgb.dask.run(client, train, X, y)

            # models is a dictionary mapping workers to results.
            # We expect that the models are the same over all workers.
            first_model = next(iter(models.values()))
            assert all(model == first_model for model in models.values())

            # We can also train using the sklearn API. The call returns each
            # worker's predictions, which this demo does not inspect.
            xgb.dask.run(client, train_with_sklearn, X, y)


if __name__ == '__main__':
    main()