Avoid default tokenization in Dask (#10398)

---------

Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
This commit is contained in:
Richard (Rick) Zamora 2024-06-14 06:44:54 -05:00 committed by GitHub
parent 01ff2b2c29
commit dc14f98f40
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 16 additions and 9 deletions

View File

@ -7,6 +7,7 @@ import json
import os import os
import re import re
import sys import sys
import uuid
import warnings import warnings
import weakref import weakref
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
@ -3143,3 +3144,9 @@ class Booster:
UserWarning, UserWarning,
) )
return nph_stacked return nph_stacked
def __dask_tokenize__(self) -> uuid.UUID:
# Custom tokenization hook for Dask: instead of a token derived from the
# object's contents, hand back a freshly generated random UUID on every call.
# TODO: Implement proper tokenization to avoid unnecessary re-computation in
# Dask. However, default tokenization causes problems after
# https://github.com/dask/dask/pull/10883
# uuid4() is random, so repeated calls (even on the same object) are not
# expected to produce equal tokens; this is the source of the re-computation
# mentioned in the TODO above.
return uuid.uuid4()

View File

@ -25,7 +25,7 @@ RUN \
mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \ mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \
python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \ python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
"nccl>=${NCCL_SHORT_VER}" \ "nccl>=${NCCL_SHORT_VER}" \
dask=2024.1.1 \ dask \
dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \ dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
"pyspark>=3.4.0" cloudpickle cuda-python && \ "pyspark>=3.4.0" cloudpickle cuda-python && \

View File

@ -28,7 +28,7 @@ RUN \
mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \ mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \
python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cudatoolkit=$CUDA_VERSION_ARG \ python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cudatoolkit=$CUDA_VERSION_ARG \
"nccl>=${NCCL_SHORT_VER}" \ "nccl>=${NCCL_SHORT_VER}" \
dask=2024.1.1 \ dask \
"dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \ "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \ numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
"pyspark>=3.4.0" cloudpickle cuda-python && \ "pyspark>=3.4.0" cloudpickle cuda-python && \

View File

@ -17,8 +17,8 @@ dependencies:
- scikit-learn - scikit-learn
- pandas - pandas
- matplotlib - matplotlib
- dask>=2022.6 - dask
- distributed>=2022.6 - distributed
- python-graphviz - python-graphviz
- hypothesis>=6.46 - hypothesis>=6.46
- astroid - astroid

View File

@ -248,10 +248,10 @@ class TestDistributedGPU:
import dask_cudf import dask_cudf
X, y = make_categorical(local_cuda_client, 10000, 30, 13) X, y = make_categorical(local_cuda_client, 10000, 30, 13)
X = dask_cudf.from_dask_dataframe(X) X = X.to_backend("cudf")
X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, True) X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, True)
X_onehot = dask_cudf.from_dask_dataframe(X_onehot) X_onehot = X_onehot.to_backend("cudf")
run_categorical(local_cuda_client, "hist", "cuda", X, X_onehot, y) run_categorical(local_cuda_client, "hist", "cuda", X, X_onehot, y)
@given( @given(
@ -383,9 +383,9 @@ class TestDistributedGPU:
X_, y_, w_ = generate_array(with_weights=True) X_, y_, w_ = generate_array(with_weights=True)
y_ = (y_ * 10).astype(np.int32) y_ = (y_ * 10).astype(np.int32)
X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_)) X = dd.from_dask_array(X_).to_backend("cudf")
y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_)) y = dd.from_dask_array(y_).to_backend("cudf")
w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_)) w = dd.from_dask_array(w_).to_backend("cudf")
run_dask_classifier(X, y, w, model, "hist", "cuda", local_cuda_client, 10) run_dask_classifier(X, y, w, model, "hist", "cuda", local_cuda_client, 10)
def test_empty_dmatrix(self, local_cuda_client: Client) -> None: def test_empty_dmatrix(self, local_cuda_client: Client) -> None: