[doc] Document Python inputs. (#8643)

This commit is contained in:
Jiaming Yuan 2023-01-10 15:39:32 +08:00 committed by GitHub
parent 4e12f3e1bc
commit 1b58d81315
4 changed files with 113 additions and 33 deletions


@@ -32,24 +32,9 @@ To verify your installation, run the following in Python:
Data Interface
--------------
The XGBoost Python module is able to load data from many different types of data formats, including both CPU and GPU data structures. For a complete list of supported data types, please refer to :ref:`py-data`. For a detailed description of the text input formats, please visit :doc:`/tutorials/input_format`.
The input data is stored in a :py:class:`DMatrix <xgboost.DMatrix>` object. For the sklearn estimator interface, a :py:class:`DMatrix` or a :py:class:`QuantileDMatrix` is created depending on the chosen algorithm and the input; see the sklearn API reference for details. We will illustrate some of the basic input types with the ``DMatrix`` here.
* To load a NumPy array into :py:class:`DMatrix <xgboost.DMatrix>`:
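  A minimal sketch (the shapes and random values are illustrative):

  .. code-block:: python

     import numpy as np
     import xgboost as xgb

     # 5 rows, each with 10 features, and one binary label per row.
     data = np.random.rand(5, 10)
     label = np.random.randint(2, size=5)
     dtrain = xgb.DMatrix(data, label=label)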
@@ -120,6 +105,81 @@ to number of groups.
recommended to use pandas ``read_csv`` or other similar utilities rather than
XGBoost's built-in parser.
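For example (a minimal sketch; ``train.csv`` and the ``target`` column are
hypothetical names):

.. code-block:: python

   import pandas as pd
   import xgboost as xgb

   # Parse the CSV with pandas, then hand the resulting DataFrame to XGBoost.
   df = pd.read_csv("train.csv")
   y = df["target"]
   X = df.drop(columns=["target"])
   dtrain = xgb.DMatrix(X, label=y)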
.. _py-data:
Supported data structures for various XGBoost functions
=======================================================
*******
Markers
*******
- T: Supported.
- F: Not supported.
- NE: Invalid type for the use case. For instance, `pd.Series` cannot be a multi-target label.
- NPA: Supported via conversion to a numpy array.
- CPA: Supported via conversion to a cupy array.
- SciCSR: Supported via conversion to scipy sparse CSR. The conversion to scipy CSR may or may not be possible; a type error is raised if the conversion fails.
- FF: Support may be added in the near future if requested.
- empty: To be filled in.
************
Table Header
************
- `X` means predictor matrix.
- Meta info: label, weight, etc.
- Multi Label: 2-dim label for multi-target.
- Others: Anything else that we don't list here explicitly, including formats like `lil`, `dia`, and `bsr`. XGBoost will try to convert them into scipy CSR.
**************
Support Matrix
**************
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| Name | DMatrix X | QuantileDMatrix X | Sklearn X | Meta Info | Inplace prediction | Multi Label |
+=========================+===========+===================+===========+===========+====================+=============+
| numpy.ndarray | T | T | T | T | T | T |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| scipy.sparse.csr | T | T | T | NE | T | F |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| scipy.sparse.csc | T | F | T | NE | F | F |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| scipy.sparse.coo | SciCSR | F | SciCSR | NE | F | F |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| uri | T | F | F | F | NE | F |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| list | NPA | NPA | NPA | NPA | NPA | T |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| tuple | NPA | NPA | NPA | NPA | NPA | T |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| pandas.DataFrame | NPA | NPA | NPA | NPA | NPA | NPA |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| pandas.Series | NPA | NPA | NPA | NPA | NPA | NE |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| cudf.DataFrame | T | T | T | T | T | T |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| cudf.Series | T | T | T | T | FF | NE |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| cupy.ndarray | T | T | T | T | T | T |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| dlpack | CPA | CPA | | CPA | FF | FF |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| datatable.Frame | T | FF | | NPA | FF | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| datatable.Table | T | FF | | NPA | FF | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| modin.DataFrame | NPA | FF | NPA | NPA | FF | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| modin.Series | NPA | FF | NPA | NPA | FF | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| pyarrow.Table | T | F | | NPA | FF | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| pyarrow.dataset.Dataset | T | F | | | F | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| _\_array\_\_ | NPA | F | NPA | NPA | H | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
| Others | SciCSR | F | | F | F | |
+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+
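As an illustration of the ``NPA`` and ``SciCSR`` markers, a minimal sketch:

.. code-block:: python

   import numpy as np
   import scipy.sparse
   import xgboost as xgb

   # A Python list is accepted via conversion to a numpy array (NPA).
   X_list = [[1.0, 2.0], [3.0, 4.0]]
   dtrain = xgb.DMatrix(X_list, label=np.array([0.0, 1.0]))

   # A COO matrix is accepted via conversion to scipy sparse CSR (SciCSR).
   X_coo = scipy.sparse.coo_matrix(np.asarray(X_list))
   dtrain_sparse = xgb.DMatrix(X_coo, label=np.array([0.0, 1.0]))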
Setting Parameters
------------------


@@ -619,11 +619,11 @@ class DataSplitMode(IntEnum):
class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-methods
"""Data Matrix used in XGBoost.
    DMatrix is an internal data structure that is used by XGBoost, which is optimized
    for both memory efficiency and training speed. You can construct DMatrix from
    multiple different sources of data.
    """
    @_deprecate_positional_args
    def __init__(
        self,
@@ -647,15 +647,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m
    ) -> None:
        """Parameters
        ----------
        data :
            Data source of DMatrix. See :ref:`py-data` for a list of supported input
            types.
        label : array_like
            Label of the training data.
        weight : array_like
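A minimal sketch of passing meta info alongside the predictor matrix (shapes and
values are illustrative):

.. code-block:: python

   import numpy as np
   import xgboost as xgb

   # Predictor matrix plus label and per-sample weight meta info.
   X = np.random.rand(10, 3)
   y = np.random.rand(10)
   w = np.ones(10)
   dtrain = xgb.DMatrix(X, label=y, weight=w)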


@@ -939,7 +939,14 @@ class XGBModel(XGBModelBase):
        Parameters
        ----------
        X :
            Feature matrix. See :ref:`py-data` for a list of supported types.

            When ``tree_method`` is set to ``hist`` or ``gpu_hist``, a
            :py:class:`QuantileDMatrix` is used internally instead of a
            :py:class:`DMatrix` to conserve memory. However, this has performance
            implications when the device of the input data does not match the device
            used by the algorithm. For instance, if the input is a numpy array on CPU
            but ``gpu_hist`` is used for training, then the data is first processed on
            CPU and then transferred to GPU; see the sketch below.
        y :
            Labels
        sample_weight :
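A minimal sketch of the matched-device case referenced above (CPU data with the
CPU ``hist`` tree method, so no cross-device transfer is needed):

.. code-block:: python

   import numpy as np
   import xgboost as xgb

   X = np.random.rand(100, 10)
   y = np.random.rand(100)

   # ``hist`` builds a QuantileDMatrix internally from the CPU array.
   model = xgb.XGBRegressor(tree_method="hist", n_estimators=10)
   model.fit(X, y)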
@@ -982,6 +989,7 @@ class XGBModel(XGBModelBase):
        callbacks :
            .. deprecated:: 1.6.0
                Use `callbacks` in :py:meth:`__init__` or :py:meth:`set_params` instead.
        """
        with config_context(verbosity=self.verbosity):
            evals_result: TrainingCallback.EvalsLog = {}
@@ -1567,7 +1575,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase):
        Parameters
        ----------
        X : array_like
            Feature matrix. See :ref:`py-data` for a list of supported types.
        ntree_limit : int
            Deprecated, use `iteration_range` instead.
        validate_features : bool
@@ -1846,7 +1854,14 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
        Parameters
        ----------
        X :
            Feature matrix. See :ref:`py-data` for a list of supported types.

            When ``tree_method`` is set to ``hist`` or ``gpu_hist``, a
            :py:class:`QuantileDMatrix` is used internally instead of a
            :py:class:`DMatrix` to conserve memory. However, this has performance
            implications when the device of the input data does not match the device
            used by the algorithm. For instance, if the input is a numpy array on CPU
            but ``gpu_hist`` is used for training, then the data is first processed on
            CPU and then transferred to GPU; see the sketch below.
        y :
            Labels
        group :
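A minimal sketch of fitting the ranker (the toy query/document layout is
illustrative only):

.. code-block:: python

   import numpy as np
   import xgboost as xgb

   # Two queries with four documents each; labels are relevance degrees.
   X = np.random.rand(8, 5)
   y = np.random.randint(5, size=8)
   qid = np.array([0, 0, 0, 0, 1, 1, 1, 1])

   ranker = xgb.XGBRanker(tree_method="hist", n_estimators=10)
   ranker.fit(X, y, qid=qid)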
@@ -1917,6 +1932,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn):
        callbacks :
            .. deprecated:: 1.6.0
                Use `callbacks` in :py:meth:`__init__` or :py:meth:`set_params` instead.
        """
        # check if group information is provided
        with config_context(verbosity=self.verbosity):


@@ -1084,6 +1084,12 @@ def test_pandas_input():
    )
    np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1]))

    # A single column pulled from a DataFrame is a pd.Series; the sklearn
    # estimator interface accepts it directly as the feature matrix.
    train_ser = train["k1"]
    assert isinstance(train_ser, pd.Series)
    model = xgb.XGBClassifier(n_estimators=8)
    model.fit(train_ser, target, eval_set=[(train_ser, target)])
    assert tm.non_increasing(model.evals_result()["validation_0"]["logloss"])
@pytest.mark.parametrize("tree_method", ["approx", "hist"])
def test_feature_weights(tree_method):
@@ -1239,6 +1245,10 @@ def test_multilabel_classification() -> None:
    np.testing.assert_allclose(clf.predict(X), predt)
    assert predt.dtype == np.int64

    # Lists are accepted as labels via conversion to a numpy array.
    y = y.tolist()
    clf.fit(X, y)
    np.testing.assert_allclose(clf.predict(X), predt)
def test_data_initialization():
    from sklearn.datasets import load_digits