diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst index 6ef42c067..ad6ec3659 100644 --- a/doc/python/python_api.rst +++ b/doc/python/python_api.rst @@ -82,6 +82,8 @@ Dask API .. autofunction:: xgboost.dask.DaskDMatrix +.. autofunction:: xgboost.dask.train + .. autofunction:: xgboost.dask.predict .. autofunction:: xgboost.dask.DaskXGBClassifier diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index d5079b403..94167d487 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -77,6 +77,27 @@ interface with ``DaskXGBClassifier`` and ``DaskXGBRegressor``. See ``xgboost/de for more examples. +***************************************************************************** +Why is the initialization of ``DaskDMatrix`` so slow and throws weird errors +***************************************************************************** + +The dask API in XGBoost requires construction of ``DaskDMatrix``. With the ``Scikit-Learn`` +interface, ``DaskDMatrix`` is implicitly constructed for each input data during `fit` or +`predict`. You might have observed its construction is taking an incredible amount of time, +and sometimes throws errors that don't seem to be relevant to `DaskDMatrix`. Here is a +brief explanation for why. By default most of dask's computation is `lazy +<https://docs.dask.org/en/latest/user-interfaces.html#laziness-and-computation>`_, which +means the computation is not carried out until you explicitly ask for the result, either by +calling `compute()` or `wait()`. See the above link for details in dask, and `this wiki +<https://en.wikipedia.org/wiki/Lazy_evaluation>`_ for the general concept of lazy evaluation. +The `DaskDMatrix` constructor forces all lazy computation to materialize, which means it's +where all your earlier computation is actually being carried out, including operations like +`dd.read_csv()`. To isolate the computation in `DaskDMatrix` from other lazy +computations, one can explicitly wait for results of input data before calling the constructor +of `DaskDMatrix`. 
Also, dask's `web interface +<https://distributed.dask.org/en/latest/web.html>`_ can be used to monitor what operations +are currently being performed. + *********** Limitations *********** diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index e6e392d3f..b1e0fafdc 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -113,25 +113,28 @@ def _assert_client(client): class DaskDMatrix: # pylint: disable=missing-docstring, too-many-instance-attributes - '''DMatrix holding on references to Dask DataFrame or Dask Array. + '''DMatrix holding on references to Dask DataFrame or Dask Array. Constructing + a `DaskDMatrix` forces all lazy computation to be carried out. Wait for + the input data explicitly if you want to see the actual computation of + constructing `DaskDMatrix`. - Parameters - ---------- - client: dask.distributed.Client + Parameters + ---------- + client: dask.distributed.Client Specify the dask client used for training. Use default client returned from dask if it's set to None. - data : dask.array.Array/dask.dataframe.DataFrame + data : dask.array.Array/dask.dataframe.DataFrame data source of DMatrix. - label: dask.array.Array/dask.dataframe.DataFrame + label: dask.array.Array/dask.dataframe.DataFrame label used for trainin. - missing : float, optional - Value in the input data (e.g. `numpy.ndarray`) which needs - to be present as a missing value. If None, defaults to np.nan. - weight : dask.array.Array/dask.dataframe.DataFrame + missing : float, optional + Value in the input data (e.g. `numpy.ndarray`) which needs + to be present as a missing value. If None, defaults to np.nan. + weight : dask.array.Array/dask.dataframe.DataFrame Weight for each instance. - feature_names : list, optional + feature_names : list, optional Set names for features. 
- feature_types : list, optional + feature_types : list, optional Set types for features ''' @@ -349,23 +352,23 @@ def train(client, params, dtrain, *args, evals=(), **kwargs): Parameters ---------- client: dask.distributed.Client - Specify the dask client used for training. Use default client - returned from dask if it's set to None. - - Other parameters are the same as `xgboost.train` except for `evals_result`, - which is returned as part of function return value instead of argument. + Specify the dask client used for training. Use default client + returned from dask if it's set to None. + \\*\\*kwargs: + Other parameters are the same as `xgboost.train` except for `evals_result`, + which is returned as part of the function's return value instead of as an argument. Returns ------- results: dict - A dictionary containing trained booster and evaluation history. - `history` field is the same as `eval_result` from `xgboost.train`. + A dictionary containing trained booster and evaluation history. + `history` field is the same as `evals_result` from `xgboost.train`. - .. code-block:: python + .. code-block:: python - {'booster': xgboost.Booster, - 'history': {'train': {'logloss': ['0.48253', '0.35953']}, - 'eval': {'logloss': ['0.480385', '0.357756']}}} + {'booster': xgboost.Booster, + 'history': {'train': {'logloss': ['0.48253', '0.35953']}, + 'eval': {'logloss': ['0.480385', '0.357756']}}} ''' _assert_dask_support() @@ -420,15 +423,15 @@ def train(client, params, dtrain, *args, evals=(), **kwargs): def predict(client, model, data, *args): '''Run prediction with a trained booster. - .. note:: + .. note:: - Only default prediction mode is supported right now. + Only default prediction mode is supported right now. Parameters ---------- client: dask.distributed.Client - Specify the dask client used for training. Use default client - returned from dask if it's set to None. + Specify the dask client used for training. Use default client + returned from dask if it's set to None. 
model: A Booster or a dictionary returned by `xgboost.dask.train`. The trained model. data: DaskDMatrix diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index 5ac8d177d..772951583 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -136,26 +136,26 @@ def to_graphviz(booster, fmap='', num_trees=0, rankdir=None, Edge color when meets the node condition. no_color : str, default '#FF0000' Edge color when doesn't meet the node condition. - condition_node_params : dict (optional) + condition_node_params : dict, optional Condition node configuration for for graphviz. Example: .. code-block:: python - {'shape': 'box', - 'style': 'filled,rounded', - 'fillcolor': '#78bceb'} + {'shape': 'box', + 'style': 'filled,rounded', + 'fillcolor': '#78bceb'} - leaf_node_params : dict (optional) + leaf_node_params : dict, optional Leaf node configuration for graphviz. Example: .. code-block:: python - {'shape': 'box', - 'style': 'filled', - 'fillcolor': '#e48038'} + {'shape': 'box', + 'style': 'filled', + 'fillcolor': '#e48038'} - kwargs : Other keywords passed to graphviz graph_attr, E.g.: - ``graph [ {key} = {value} ]`` + \\*\\*kwargs: dict, optional + Other keywords passed to graphviz graph_attr, e.g. ``graph [ {key} = {value} ]`` Returns -------