Rewrite Dask interface. (#4819)

This commit is contained in:
Jiaming Yuan
2019-09-25 01:30:14 -04:00
committed by GitHub
parent 562bb0ae31
commit b8433c455a
17 changed files with 1002 additions and 361 deletions

View File

@@ -1,20 +0,0 @@
# Dask Integration
[Dask](https://dask.org/) is a parallel computing library built on Python. Dask allows easy management of distributed workers and excels at handling large distributed data science workflows.
The simple demo shows how to train and make predictions for an xgboost model on a distributed dask environment. We make use of first-class support in xgboost for launching dask workers. Workers launched in this manner are automatically connected via xgboost's underlying communication framework, Rabit. The calls to `xgb.train()` and `xgb.predict()` occur in parallel on each worker and are synchronized.
The GPU demo shows how to configure and use GPUs on the local machine for training on a large dataset.
## Requirements
Dask is trivial to install using either pip or conda. [See here for official install documentation](https://docs.dask.org/en/latest/install.html).
The GPU demo requires [GPUtil](https://github.com/anderskm/gputil) for detecting system GPUs.
Install via `pip install gputil`
## Running the scripts
```bash
python dask_simple_demo.py
python dask_gpu_demo.py
```

35
demo/dask/cpu_training.py Normal file
View File

@@ -0,0 +1,35 @@
import xgboost as xgb
from xgboost.dask import DaskDMatrix
from dask.distributed import Client
from dask.distributed import LocalCluster
from dask import array as da
def main(client):
    '''Train on random data with the `hist` tree method and predict on it.

    Parameters
    ----------
    client :
        Dask client handed to ``DaskDMatrix``, ``xgb.dask.train`` and
        ``xgb.dask.predict``.

    Returns
    -------
    The value returned by ``xgb.dask.predict`` for the training matrix.
    '''
    num_rows, num_cols = 100000, 100
    chunk_size = 1000

    # Random features and labels, both chunked with the same size.
    features = da.random.random((num_rows, num_cols), chunk_size)
    labels = da.random.random(num_rows, chunk_size)
    dtrain = DaskDMatrix(client, features, labels)

    params = {'verbosity': 2,
              'nthread': 1,
              'tree_method': 'hist'}
    # The result is indexed by 'booster' and 'history' below.
    result = xgb.dask.train(client, params, dtrain,
                            num_boost_round=4,
                            evals=[(dtrain, 'train')])
    booster = result['booster']
    eval_history = result['history']

    prediction = xgb.dask.predict(client, booster, dtrain)
    print('Evaluation history:', eval_history)
    return prediction
if __name__ == '__main__':
    # or use any other clusters
    # Both objects are used as context managers so the workers and the
    # client connection are shut down when main() returns or raises.
    with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            main(client)

View File

@@ -1,42 +0,0 @@
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
import dask.array as da
import numpy as np
import xgboost as xgb
import GPUtil
import time
# Define the function to be executed on each worker
def train(X, y, available_devices):
    """Train a GPU booster on this worker's share of the data.

    Parameters
    ----------
    X, y :
        Data and labels passed to ``xgb.dask.create_worker_dmatrix``.
    available_devices :
        Sequence of GPU ids, indexed by this worker's rabit rank.
    """
    dtrain = xgb.dask.create_worker_dmatrix(X, y)
    rank = xgb.rabit.get_rank()

    # Specify the GPU algorithm and device for this worker
    params = {"tree_method": "gpu_hist", "gpu_id": available_devices[rank]}

    print("Worker {} starting training on {} rows".format(rank, dtrain.num_row()))
    started = time.time()
    xgb.train(params, dtrain, num_boost_round=500)
    elapsed = time.time() - started
    print("Worker {} finished training in {:0.2f}s".format(rank, elapsed))
def main():
    """Launch one local worker per detected GPU and train on random data.

    The cluster and client are used as context managers so that the worker
    processes and the client connection are released when training
    finishes or fails.
    """
    max_devices = 16
    # Check which devices we have locally
    available_devices = GPUtil.getAvailable(limit=max_devices)

    # Set up a relatively large regression problem
    n = 100
    m = 10000000
    partition_size = 100000

    # Use one worker per device
    with LocalCluster(n_workers=len(available_devices), threads_per_worker=4) as cluster:
        with Client(cluster) as client:
            X = da.random.random((m, n), partition_size)
            y = da.random.random(m, partition_size)

            xgb.dask.run(client, train, X, y, available_devices)


if __name__ == '__main__':
    main()

View File

@@ -1,68 +0,0 @@
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
import dask.array as da
import numpy as np
import xgboost as xgb
# Define the function to be executed on each worker
def train(X, y):
    """Train a default booster on this worker and return its text dump.

    Parameters
    ----------
    X, y :
        Dask objects distributed across the cluster.

    Returns
    -------
    The result of ``Booster.get_dump()`` for the trained model.
    """
    rank = xgb.rabit.get_rank()
    print("Start training with worker #{}".format(rank))

    # create_worker_dmatrix mirrors the xgb.DMatrix() constructor, except
    # that it 'unpacks' the dask objects to obtain the data local to this
    # worker.
    local_dmatrix = xgb.dask.create_worker_dmatrix(X, y)

    # Workers communicate and synchronise while training, so the resulting
    # model is expected to be identical on each of them.
    booster = xgb.train({}, local_dmatrix)

    # Make predictions on local data (the result is not used further).
    booster.predict(local_dmatrix)

    print("Finished training with worker #{}".format(xgb.rabit.get_rank()))

    # Get text representation of the model
    return booster.get_dump()
def train_with_sklearn(X, y):
    """Fit an ``XGBRegressor`` on this worker's local data and predict on it.

    Parameters
    ----------
    X, y :
        Objects passed to ``xgb.dask.get_local_data`` to obtain the local
        features and labels.

    Returns
    -------
    The regressor's predictions for the local features.
    """
    rank = xgb.rabit.get_rank()
    print("Training with worker #{} using the sklearn API".format(rank))

    local_features = xgb.dask.get_local_data(X)
    local_labels = xgb.dask.get_local_data(y)

    regressor = xgb.XGBRegressor(n_estimators=10)
    regressor.fit(local_features, local_labels)

    print("Finished training with worker #{} using the sklearn API".format(xgb.rabit.get_rank()))
    return regressor.predict(local_features)
def main():
    """Run both demo training functions on a small local cluster.

    The cluster and client are used as context managers so the worker
    processes and the client connection are released once the demo ends,
    including when it fails.
    """
    # Generate some small test data as a dask array
    # These data frames are internally split into partitions of 20 rows each and then distributed
    # among workers, so we will have 5 partitions distributed among 2 workers
    # Note that the partition size MUST be consistent across different dask dataframes/arrays
    n = 10
    m = 100
    partition_size = 20

    # Launch a very simple local cluster using two distributed workers with two CPU threads each
    with LocalCluster(n_workers=2, threads_per_worker=2) as cluster:
        with Client(cluster) as client:
            X = da.random.random((m, n), partition_size)
            y = da.random.random(m, partition_size)

            # xgb.dask.run launches an arbitrary function and its arguments on the cluster
            # Here train(X, y) will be called on each worker
            # This function blocks until all work is complete
            models = xgb.dask.run(client, train, X, y)

            # models contains a dictionary mapping workers to results
            # We expect that the models are the same over all workers
            first_model = next(iter(models.values()))
            assert all(model == first_model for model in models.values())

            # We can also train using the sklearn API
            xgb.dask.run(client, train_with_sklearn, X, y)


if __name__ == '__main__':
    main()

41
demo/dask/gpu_training.py Normal file
View File

@@ -0,0 +1,41 @@
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from dask import array as da
import xgboost as xgb
from xgboost.dask import DaskDMatrix
def main(client):
    '''Train on random data with the `gpu_hist` tree method and predict on it.

    Parameters
    ----------
    client :
        Dask client handed to ``DaskDMatrix``, ``xgb.dask.train`` and
        ``xgb.dask.predict``.

    Returns
    -------
    The value returned by ``xgb.dask.predict`` for the training matrix.
    '''
    num_rows, num_cols = 100000, 100
    chunk_size = 1000

    features = da.random.random((num_rows, num_cols), chunk_size)
    labels = da.random.random(num_rows, chunk_size)

    # DaskDMatrix behaves like a regular DMatrix; it is a proxy for the
    # local DMatrix objects scattered across the workers.
    dtrain = DaskDMatrix(client, features, labels)

    params = {'verbosity': 2,
              'nthread': 1,
              'tree_method': 'gpu_hist'}
    # Call train from xgboost.dask rather than from xgboost. The
    # distributed version returns a dictionary holding the resulting
    # booster and the evaluation history gathered from the evaluation
    # metrics.
    result = xgb.dask.train(client, params, dtrain,
                            num_boost_round=4,
                            evals=[(dtrain, 'train')])
    booster = result['booster']
    eval_history = result['history']

    prediction = xgb.dask.predict(client, booster, dtrain)
    print('Evaluation history:', eval_history)
    return prediction
if __name__ == '__main__':
    # or use any other clusters
    # Both objects are used as context managers so the GPU workers and the
    # client connection are shut down when main() returns or raises.
    with LocalCUDACluster(n_workers=4, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            main(client)

View File

@@ -0,0 +1,30 @@
'''Dask interface demo:
Use scikit-learn regressor interface with CPU histogram tree method.'''
from dask.distributed import Client
from dask.distributed import LocalCluster
from dask import array as da
import xgboost
if __name__ == '__main__':
    # or use any other clusters
    # The cluster and client are context managers here so the workers and
    # the client connection are released when the demo ends or fails.
    with LocalCluster(n_workers=2, silence_logs=False) as cluster:
        with Client(cluster) as client:
            n = 100
            m = 10000
            partition_size = 100
            X = da.random.random((m, n), partition_size)
            y = da.random.random(m, partition_size)

            regressor = xgboost.dask.DaskXGBRegressor(verbosity=2, n_estimators=2)
            regressor.set_params(tree_method='hist')
            # The estimator is told which client to use through this attribute.
            regressor.client = client

            regressor.fit(X, y, eval_set=[(X, y)])
            prediction = regressor.predict(X)

            # Shown for illustration: the underlying booster and the
            # recorded evaluation results are available after fitting.
            bst = regressor.get_booster()
            history = regressor.evals_result()

            print('Evaluation history:', history)
            assert isinstance(prediction, da.Array)

View File

@@ -0,0 +1,31 @@
'''Dask interface demo:
Use scikit-learn regressor interface with GPU histogram tree method.'''
from dask.distributed import Client
# It's recommended to use dask_cuda for GPU assignment
from dask_cuda import LocalCUDACluster
from dask import array as da
import xgboost
if __name__ == '__main__':
    # The cluster and client are context managers here so the GPU workers
    # and the client connection are released when the demo ends or fails.
    with LocalCUDACluster() as cluster:
        with Client(cluster) as client:
            n = 100
            m = 1000000
            partition_size = 10000
            X = da.random.random((m, n), partition_size)
            y = da.random.random(m, partition_size)

            regressor = xgboost.dask.DaskXGBRegressor(verbosity=2)
            regressor.set_params(tree_method='gpu_hist')
            # The estimator is told which client to use through this attribute.
            regressor.client = client

            regressor.fit(X, y, eval_set=[(X, y)])
            prediction = regressor.predict(X)

            # Shown for illustration: the underlying booster and the
            # recorded evaluation results are available after fitting.
            bst = regressor.get_booster()
            history = regressor.evals_result()

            print('Evaluation history:', history)