Add native support for Dask (#4473)
* Add native support for Dask * Add multi-GPU demo * Add sklearn example
This commit is contained in:
20
demo/dask/README.md
Normal file
20
demo/dask/README.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# Dask Integration
|
||||
|
||||
[Dask](https://dask.org/) is a parallel computing library built on Python. Dask allows easy management of distributed workers and excels at handling large distributed data science workflows.
|
||||
|
||||
The simple demo shows how to train and make predictions for an xgboost model in a distributed dask environment. We make use of first-class support in xgboost for launching dask workers. Workers launched in this manner are automatically connected via xgboost's underlying communication framework, Rabit. The calls to `xgb.train()` and `xgb.predict()` occur in parallel on each worker and are synchronized.
|
||||
|
||||
The GPU demo shows how to configure and use GPUs on the local machine for training on a large dataset.
|
||||
|
||||
## Requirements
|
||||
Dask is trivial to install using either pip or conda. [See here for official install documentation](https://docs.dask.org/en/latest/install.html).
|
||||
|
||||
The GPU demo requires [GPUtil](https://github.com/anderskm/gputil) for detecting system GPUs.
|
||||
|
||||
Install via `pip install gputil`
|
||||
|
||||
## Running the scripts
|
||||
```bash
|
||||
python dask_simple_demo.py
|
||||
python dask_gpu_demo.py
|
||||
```
|
||||
42
demo/dask/dask_gpu_demo.py
Normal file
42
demo/dask/dask_gpu_demo.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from dask.distributed import Client, LocalCluster
|
||||
import dask.dataframe as dd
|
||||
import dask.array as da
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import GPUtil
|
||||
import time
|
||||
|
||||
|
||||
# Worker-side entry point: runs once on every dask worker in the cluster.
def train(X, y, available_devices):
    """Train an xgboost model on this worker's portion of (X, y) using one GPU.

    The worker's Rabit rank indexes into ``available_devices``, so workers
    map one-to-one onto the GPUs detected by the driver.
    """
    rank = xgb.rabit.get_rank()
    dtrain = xgb.dask.create_worker_dmatrix(X, y)
    # Specify the GPU algorithm and device for this worker
    params = {"tree_method": "gpu_hist", "gpu_id": available_devices[rank]}
    print("Worker {} starting training on {} rows".format(rank, dtrain.num_row()))
    t0 = time.time()
    xgb.train(params, dtrain, num_boost_round=500)
    t1 = time.time()
    print("Worker {} finished training in {:0.2f}s".format(rank, t1 - t0))
|
||||
|
||||
|
||||
def main():
    """Launch one dask worker per local GPU and train a large regression model."""
    max_devices = 16
    # Check which GPUs we have locally (GPUtil inspects this machine only)
    available_devices = GPUtil.getAvailable(limit=max_devices)
    # Use one worker per device. The context managers ensure the scheduler,
    # workers and client are shut down cleanly when the demo finishes
    # (the original leaked them on exit).
    with LocalCluster(n_workers=len(available_devices), threads_per_worker=4) as cluster:
        with Client(cluster) as client:
            # Set up a relatively large regression problem
            n = 100
            m = 10000000
            partition_size = 100000
            # Pass chunks by keyword: relying on it being the second
            # positional parameter of da.random.random is fragile.
            X = da.random.random((m, n), chunks=partition_size)
            y = da.random.random(m, chunks=partition_size)

            # Run train() on every worker; blocks until all workers finish.
            xgb.dask.run(client, train, X, y, available_devices)


if __name__ == '__main__':
    main()
|
||||
68
demo/dask/dask_simple_demo.py
Normal file
68
demo/dask/dask_simple_demo.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from dask.distributed import Client, LocalCluster
|
||||
import dask.dataframe as dd
|
||||
import dask.array as da
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
|
||||
|
||||
# Worker-side entry point executed on every dask worker.
def train(X, y):
    """Train on this worker's share of the distributed data.

    Returns the text dump of the boosted trees so the driver can check
    that every worker produced the same model.
    """
    rank = xgb.rabit.get_rank()
    print("Start training with worker #{}".format(rank))
    # X, y are dask objects distributed across the cluster; we must obtain
    # the rows local to this worker and convert them to a DMatrix.
    # xgb.dask.create_worker_dmatrix follows the API of the standard
    # DMatrix constructor (xgb.DMatrix()) exactly, except that it
    # 'unpacks' dask distributed objects to obtain data local to this worker.
    dtrain = xgb.dask.create_worker_dmatrix(X, y)

    # Workers communicate and synchronise with each other during training,
    # so the output model is expected to be identical on each worker.
    bst = xgb.train({}, dtrain)
    # Make predictions on local data
    pred = bst.predict(dtrain)
    print("Finished training with worker #{}".format(rank))
    # Hand back a text representation of the model
    return bst.get_dump()
|
||||
|
||||
|
||||
def train_with_sklearn(X, y):
    """Train via the sklearn wrapper API on this worker's local rows.

    Returns the predictions for the local rows.
    """
    print("Training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))
    # Materialise only the partitions held by this worker.
    local_X = xgb.dask.get_local_data(X)
    local_y = xgb.dask.get_local_data(y)
    regressor = xgb.XGBRegressor(n_estimators=10)
    regressor.fit(local_X, local_y)
    print("Finished training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))
    return regressor.predict(local_X)
|
||||
|
||||
|
||||
def main():
    """Run the two demo training functions on a small local dask cluster."""
    # Launch a very simple local cluster using two distributed workers with two
    # CPU threads each. The context managers ensure the scheduler, workers and
    # client are shut down cleanly on exit (the original leaked them).
    with LocalCluster(n_workers=2, threads_per_worker=2) as cluster, Client(cluster) as client:
        # Generate some small test data as a dask array.
        # The arrays are internally split into partitions of 20 rows each and
        # then distributed among workers, so we will have 5 partitions spread
        # over 2 workers. Note that the partition size MUST be consistent
        # across different dask dataframes/arrays.
        n = 10
        m = 100
        partition_size = 20
        # Pass chunks by keyword: relying on it being the second positional
        # parameter of da.random.random is fragile.
        X = da.random.random((m, n), chunks=partition_size)
        y = da.random.random(m, chunks=partition_size)

        # xgb.dask.run launches an arbitrary function and its arguments on the
        # cluster. Here train(X, y) will be called on each worker. This call
        # blocks until all work is complete.
        models = xgb.dask.run(client, train, X, y)

        # models contains a dictionary mapping workers to results.
        # We expect that the models are the same over all workers.
        first_model = next(iter(models.values()))
        assert all(model == first_model for worker, model in models.items())

        # We can also train using the sklearn API (result unused in the demo).
        xgb.dask.run(client, train_with_sklearn, X, y)


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user