This aligns the dask interface with pyspark: users need to import it explicitly, e.g. ``` from xgboost.dask import DaskXGBClassifier ``` or ``` from xgboost import dask as dxgb ```. In a future release, we may stop importing it by default and remove the lazy loader.
41 lines · 1.2 KiB · Python
"""
|
|
Use scikit-learn regressor interface with CPU histogram tree method
|
|
===================================================================
|
|
"""
|
|
from dask import array as da
|
|
from dask.distributed import Client, LocalCluster
|
|
|
|
from xgboost import dask as dxgb
|
|
|
|
|
|
def main(client):
    """Train a ``DaskXGBRegressor`` on synthetic data and return its booster.

    Parameters
    ----------
    client : distributed.Client
        Dask client connected to the cluster the training should run on.

    Returns
    -------
    xgboost.Booster
        The trained model extracted from the regressor.
    """
    # Synthetic demo data: 10k rows of 100 features, chunked 100 rows apiece.
    num_features = 100
    num_rows = 10000
    chunk_rows = 100
    X = da.random.random((num_rows, num_features), chunk_rows)
    y = da.random.random(num_rows, chunk_rows)

    regressor = dxgb.DaskXGBRegressor(verbosity=1, n_estimators=2)
    regressor.set_params(tree_method="hist")
    # Attaching the client explicitly is optional; it can also be discovered.
    regressor.client = client

    regressor.fit(X, y, eval_set=[(X, y)])
    prediction = regressor.predict(X)

    booster = regressor.get_booster()
    history = regressor.evals_result()

    print("Evaluation history:", history)
    # The dask interface always hands predictions back as a dask array.
    assert isinstance(prediction, da.Array)
    return booster  # returning the trained model
|
|
|
|
|
|
if __name__ == "__main__":
    # Spin up a small local cluster for the demo; swap in another cluster
    # type (e.g. dask-kubernetes, dask-jobqueue) to scale out.
    with LocalCluster(n_workers=4, threads_per_worker=1) as cluster, Client(
        cluster
    ) as client:
        main(client)