Avoid default tokenization in Dask (#10398)
Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
parent 01ff2b2c29
commit dc14f98f40
@@ -7,6 +7,7 @@ import json
 import os
 import re
 import sys
+import uuid
 import warnings
 import weakref
 from abc import ABC, abstractmethod
@@ -3143,3 +3144,9 @@ class Booster:
             UserWarning,
         )
         return nph_stacked
+
+    def __dask_tokenize__(self) -> uuid.UUID:
+        # TODO: Implement proper tokenization to avoid unnecessary re-computation in
+        # Dask. However, default tokenization causes problems after
+        # https://github.com/dask/dask/pull/10883
+        return uuid.uuid4()
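
For context: Dask derives a cache key for every object in a task graph via
dask.base.tokenize(), and an object can override how it is tokenized by
defining __dask_tokenize__. Returning a fresh uuid.uuid4() on each call, as
the Booster method above does, guarantees that no two tokenizations collide,
so Dask never reuses a cached result for a Booster. A minimal sketch of the
mechanism follows; the AlwaysFresh class is illustrative, not part of XGBoost:

    import uuid

    from dask.base import tokenize


    class AlwaysFresh:
        """Mimics the Booster change above: a new token on every call."""

        def __dask_tokenize__(self) -> uuid.UUID:
            # A random UUID per call means the object never hashes equal to
            # itself, so Dask cannot deduplicate or cache tasks built on it.
            return uuid.uuid4()


    obj = AlwaysFresh()
    assert tokenize(obj) != tokenize(obj)        # always unique
    assert tokenize([1, 2]) == tokenize([1, 2])  # default tokens are deterministic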
@@ -25,7 +25,7 @@ RUN \
     mamba create -y -n gpu_test -c rapidsai -c conda-forge -c nvidia \
         python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
         "nccl>=${NCCL_SHORT_VER}" \
-        dask=2024.1.1 \
+        dask \
         dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
         numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
         "pyspark>=3.4.0" cloudpickle cuda-python && \
@@ -28,7 +28,7 @@ RUN \
     mamba create -y -n gpu_test -c rapidsai-nightly -c conda-forge -c nvidia \
         python=3.10 "cudf=$RAPIDS_VERSION_ARG.*" "rmm=$RAPIDS_VERSION_ARG.*" cudatoolkit=$CUDA_VERSION_ARG \
         "nccl>=${NCCL_SHORT_VER}" \
-        dask=2024.1.1 \
+        dask \
         "dask-cuda=$RAPIDS_VERSION_ARG.*" "dask-cudf=$RAPIDS_VERSION_ARG.*" cupy \
         numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
         "pyspark>=3.4.0" cloudpickle cuda-python && \
@@ -17,8 +17,8 @@ dependencies:
 - scikit-learn
 - pandas
 - matplotlib
-- dask>=2022.6
-- distributed>=2022.6
+- dask
+- distributed
 - python-graphviz
 - hypothesis>=6.46
 - astroid
@@ -248,10 +248,10 @@ class TestDistributedGPU:
         import dask_cudf

         X, y = make_categorical(local_cuda_client, 10000, 30, 13)
-        X = dask_cudf.from_dask_dataframe(X)
+        X = X.to_backend("cudf")

         X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, True)
-        X_onehot = dask_cudf.from_dask_dataframe(X_onehot)
+        X_onehot = X_onehot.to_backend("cudf")
         run_categorical(local_cuda_client, "hist", "cuda", X, X_onehot, y)

     @given(
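
The test changes above follow an upstream API migration: the
dask_cudf.from_dask_dataframe helper is deprecated in favor of Dask's
backend-dispatch mechanism, where to_backend("cudf") converts a collection's
partitions from pandas to cudf. A minimal sketch of the new style, assuming a
CUDA environment with dask_cudf installed (the toy frame is illustrative):

    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)

    # Old style (deprecated): dask_cudf.from_dask_dataframe(ddf)
    gdf = ddf.to_backend("cudf")    # partitions become cudf.DataFrame objects
    pdf = gdf.to_backend("pandas")  # the same call converts back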
@@ -383,9 +383,9 @@ class TestDistributedGPU:

         X_, y_, w_ = generate_array(with_weights=True)
         y_ = (y_ * 10).astype(np.int32)
-        X = dask_cudf.from_dask_dataframe(dd.from_dask_array(X_))
-        y = dask_cudf.from_dask_dataframe(dd.from_dask_array(y_))
-        w = dask_cudf.from_dask_dataframe(dd.from_dask_array(w_))
+        X = dd.from_dask_array(X_).to_backend("cudf")
+        y = dd.from_dask_array(y_).to_backend("cudf")
+        w = dd.from_dask_array(w_).to_backend("cudf")
         run_dask_classifier(X, y, w, model, "hist", "cuda", local_cuda_client, 10)

     def test_empty_dmatrix(self, local_cuda_client: Client) -> None: