Rewrite Dask interface. (#4819)
This commit is contained in:
@@ -1,20 +0,0 @@
|
||||
# Dask Integration
|
||||
|
||||
[Dask](https://dask.org/) is a parallel computing library built on Python. Dask allows easy management of distributed workers and excels at handling large distributed data science workflows.
|
||||
|
||||
The simple demo shows how to train and make predictions for an xgboost model on a distributed dask environment. We make use of first-class support in xgboost for launching dask workers. Workers launched in this manner are automatically connected via xgboost's underlying communication framework, Rabit. The calls to `xgb.train()` and `xgb.predict()` occur in parallel on each worker and are synchronized.
|
||||
|
||||
The GPU demo shows how to configure and use GPUs on the local machine for training on a large dataset.
|
||||
|
||||
## Requirements
|
||||
Dask is trivial to install using either pip or conda. [See here for official install documentation](https://docs.dask.org/en/latest/install.html).
|
||||
|
||||
The GPU demo requires [GPUtil](https://github.com/anderskm/gputil) for detecting system GPUs.
|
||||
|
||||
Install via `pip install gputil`
|
||||
|
||||
## Running the scripts
|
||||
```bash
|
||||
python dask_simple_demo.py
|
||||
python dask_gpu_demo.py
|
||||
```
|
||||
35
demo/dask/cpu_training.py
Normal file
35
demo/dask/cpu_training.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import xgboost as xgb
|
||||
from xgboost.dask import DaskDMatrix
|
||||
from dask.distributed import Client
|
||||
from dask.distributed import LocalCluster
|
||||
from dask import array as da
|
||||
|
||||
|
||||
def main(client):
|
||||
n = 100
|
||||
m = 100000
|
||||
partition_size = 1000
|
||||
X = da.random.random((m, n), partition_size)
|
||||
y = da.random.random(m, partition_size)
|
||||
|
||||
dtrain = DaskDMatrix(client, X, y)
|
||||
|
||||
output = xgb.dask.train(client,
|
||||
{'verbosity': 2,
|
||||
'nthread': 1,
|
||||
'tree_method': 'hist'},
|
||||
dtrain,
|
||||
num_boost_round=4, evals=[(dtrain, 'train')])
|
||||
bst = output['booster']
|
||||
history = output['history']
|
||||
|
||||
prediction = xgb.dask.predict(client, bst, dtrain)
|
||||
print('Evaluation history:', history)
|
||||
return prediction
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# or use any other clusters
|
||||
cluster = LocalCluster(n_workers=4, threads_per_worker=1)
|
||||
client = Client(cluster)
|
||||
main(client)
|
||||
@@ -1,42 +0,0 @@
|
||||
from dask.distributed import Client, LocalCluster
|
||||
import dask.dataframe as dd
|
||||
import dask.array as da
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
import GPUtil
|
||||
import time
|
||||
|
||||
|
||||
# Define the function to be executed on each worker
|
||||
def train(X, y, available_devices):
|
||||
dtrain = xgb.dask.create_worker_dmatrix(X, y)
|
||||
local_device = available_devices[xgb.rabit.get_rank()]
|
||||
# Specify the GPU algorithm and device for this worker
|
||||
params = {"tree_method": "gpu_hist", "gpu_id": local_device}
|
||||
print("Worker {} starting training on {} rows".format(xgb.rabit.get_rank(), dtrain.num_row()))
|
||||
start = time.time()
|
||||
xgb.train(params, dtrain, num_boost_round=500)
|
||||
end = time.time()
|
||||
print("Worker {} finished training in {:0.2f}s".format(xgb.rabit.get_rank(), end - start))
|
||||
|
||||
|
||||
def main():
|
||||
max_devices = 16
|
||||
# Check which devices we have locally
|
||||
available_devices = GPUtil.getAvailable(limit=max_devices)
|
||||
# Use one worker per device
|
||||
cluster = LocalCluster(n_workers=len(available_devices), threads_per_worker=4)
|
||||
client = Client(cluster)
|
||||
|
||||
# Set up a relatively large regression problem
|
||||
n = 100
|
||||
m = 10000000
|
||||
partition_size = 100000
|
||||
X = da.random.random((m, n), partition_size)
|
||||
y = da.random.random(m, partition_size)
|
||||
|
||||
xgb.dask.run(client, train, X, y, available_devices)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,68 +0,0 @@
|
||||
from dask.distributed import Client, LocalCluster
|
||||
import dask.dataframe as dd
|
||||
import dask.array as da
|
||||
import numpy as np
|
||||
import xgboost as xgb
|
||||
|
||||
|
||||
# Define the function to be executed on each worker
|
||||
def train(X, y):
|
||||
print("Start training with worker #{}".format(xgb.rabit.get_rank()))
|
||||
# X,y are dask objects distributed across the cluster.
|
||||
# We must obtain the data local to this worker and convert it to DMatrix for training.
|
||||
# xgb.dask.create_worker_dmatrix follows the API exactly of the standard DMatrix constructor
|
||||
# (xgb.DMatrix()), except that it 'unpacks' dask distributed objects to obtain data local to
|
||||
# this worker
|
||||
dtrain = xgb.dask.create_worker_dmatrix(X, y)
|
||||
|
||||
# Train on the data. Each worker will communicate and synchronise during training. The output
|
||||
# model is expected to be identical on each worker.
|
||||
bst = xgb.train({}, dtrain)
|
||||
# Make predictions on local data
|
||||
pred = bst.predict(dtrain)
|
||||
print("Finished training with worker #{}".format(xgb.rabit.get_rank()))
|
||||
# Get text representation of the model
|
||||
return bst.get_dump()
|
||||
|
||||
|
||||
def train_with_sklearn(X, y):
|
||||
print("Training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))
|
||||
X_local = xgb.dask.get_local_data(X)
|
||||
y_local = xgb.dask.get_local_data(y)
|
||||
model = xgb.XGBRegressor(n_estimators=10)
|
||||
model.fit(X_local, y_local)
|
||||
print("Finished training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))
|
||||
return model.predict(X_local)
|
||||
|
||||
|
||||
def main():
|
||||
# Launch a very simple local cluster using two distributed workers with two CPU threads each
|
||||
cluster = LocalCluster(n_workers=2, threads_per_worker=2)
|
||||
client = Client(cluster)
|
||||
|
||||
# Generate some small test data as a dask array
|
||||
# These data frames are internally split into partitions of 20 rows each and then distributed
|
||||
# among workers, so we will have 5 partitions distributed among 2 workers
|
||||
# Note that the partition size MUST be consistent across different dask dataframes/arrays
|
||||
n = 10
|
||||
m = 100
|
||||
partition_size = 20
|
||||
X = da.random.random((m, n), partition_size)
|
||||
y = da.random.random(m, partition_size)
|
||||
|
||||
# xgb.dask.run launches an arbitrary function and its arguments on the cluster
|
||||
# Here train(X, y) will be called on each worker
|
||||
# This function blocks until all work is complete
|
||||
models = xgb.dask.run(client, train, X, y)
|
||||
|
||||
# models contains a dictionary mapping workers to results
|
||||
# We expect that the models are the same over all workers
|
||||
first_model = next(iter(models.values()))
|
||||
assert all(model == first_model for worker, model in models.items())
|
||||
|
||||
# We can also train using the sklearn API
|
||||
results = xgb.dask.run(client, train_with_sklearn, X, y)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
41
demo/dask/gpu_training.py
Normal file
41
demo/dask/gpu_training.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from dask_cuda import LocalCUDACluster
|
||||
from dask.distributed import Client
|
||||
from dask import array as da
|
||||
import xgboost as xgb
|
||||
from xgboost.dask import DaskDMatrix
|
||||
|
||||
|
||||
def main(client):
|
||||
n = 100
|
||||
m = 100000
|
||||
partition_size = 1000
|
||||
X = da.random.random((m, n), partition_size)
|
||||
y = da.random.random(m, partition_size)
|
||||
|
||||
# DaskDMatrix acts like normal DMatrix, works as a proxy for local
|
||||
# DMatrix scatter around workers.
|
||||
dtrain = DaskDMatrix(client, X, y)
|
||||
|
||||
# Use train method from xgboost.dask instead of xgboost. This
|
||||
# distributed version of train returns a dictionary containing the
|
||||
# resulting booster and evaluation history obtained from
|
||||
# evaluation metrics.
|
||||
output = xgb.dask.train(client,
|
||||
{'verbosity': 2,
|
||||
'nthread': 1,
|
||||
'tree_method': 'gpu_hist'},
|
||||
dtrain,
|
||||
num_boost_round=4, evals=[(dtrain, 'train')])
|
||||
bst = output['booster']
|
||||
history = output['history']
|
||||
|
||||
prediction = xgb.dask.predict(client, bst, dtrain)
|
||||
print('Evaluation history:', history)
|
||||
return prediction
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# or use any other clusters
|
||||
cluster = LocalCUDACluster(n_workers=4, threads_per_worker=1)
|
||||
client = Client(cluster)
|
||||
main(client)
|
||||
30
demo/dask/sklearn_cpu_training.py
Normal file
30
demo/dask/sklearn_cpu_training.py
Normal file
@@ -0,0 +1,30 @@
|
||||
'''Dask interface demo:
|
||||
|
||||
Use scikit-learn regressor interface with CPU histogram tree method.'''
|
||||
from dask.distributed import Client
|
||||
from dask.distributed import LocalCluster
|
||||
from dask import array as da
|
||||
import xgboost
|
||||
|
||||
if __name__ == '__main__':
|
||||
cluster = LocalCluster(n_workers=2, silence_logs=False) # or use any other clusters
|
||||
client = Client(cluster)
|
||||
|
||||
n = 100
|
||||
m = 10000
|
||||
partition_size = 100
|
||||
X = da.random.random((m, n), partition_size)
|
||||
y = da.random.random(m, partition_size)
|
||||
|
||||
regressor = xgboost.dask.DaskXGBRegressor(verbosity=2, n_estimators=2)
|
||||
regressor.set_params(tree_method='hist')
|
||||
regressor.client = client
|
||||
|
||||
regressor.fit(X, y, eval_set=[(X, y)])
|
||||
prediction = regressor.predict(X)
|
||||
|
||||
bst = regressor.get_booster()
|
||||
history = regressor.evals_result()
|
||||
|
||||
print('Evaluation history:', history)
|
||||
assert isinstance(prediction, da.Array)
|
||||
31
demo/dask/sklearn_gpu_training.py
Normal file
31
demo/dask/sklearn_gpu_training.py
Normal file
@@ -0,0 +1,31 @@
|
||||
'''Dask interface demo:
|
||||
|
||||
Use scikit-learn regressor interface with GPU histogram tree method.'''
|
||||
|
||||
from dask.distributed import Client
|
||||
# It's recommended to use dask_cuda for GPU assignment
|
||||
from dask_cuda import LocalCUDACluster
|
||||
from dask import array as da
|
||||
import xgboost
|
||||
|
||||
if __name__ == '__main__':
|
||||
cluster = LocalCUDACluster()
|
||||
client = Client(cluster)
|
||||
|
||||
n = 100
|
||||
m = 1000000
|
||||
partition_size = 10000
|
||||
X = da.random.random((m, n), partition_size)
|
||||
y = da.random.random(m, partition_size)
|
||||
|
||||
regressor = xgboost.dask.DaskXGBRegressor(verbosity=2)
|
||||
regressor.set_params(tree_method='gpu_hist')
|
||||
regressor.client = client
|
||||
|
||||
regressor.fit(X, y, eval_set=[(X, y)])
|
||||
prediction = regressor.predict(X)
|
||||
|
||||
bst = regressor.get_booster()
|
||||
history = regressor.evals_result()
|
||||
|
||||
print('Evaluation history:', history)
|
||||
Reference in New Issue
Block a user