From 1b58d813152d30eab12907f29e08b0b5233857c4 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 10 Jan 2023 15:39:32 +0800 Subject: [PATCH] [doc] Document Python inputs. (#8643) --- doc/python/python_intro.rst | 94 +++++++++++++++++++++++++------ python-package/xgboost/core.py | 20 +++---- python-package/xgboost/sklearn.py | 22 +++++++- tests/python/test_with_sklearn.py | 10 ++++ 4 files changed, 113 insertions(+), 33 deletions(-) diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst index 7951fb3d4..c36db91ff 100644 --- a/doc/python/python_intro.rst +++ b/doc/python/python_intro.rst @@ -32,24 +32,9 @@ To verify your installation, run the following in Python: Data Interface -------------- -The XGBoost python module is able to load data from many different types of data format, -including: +The XGBoost Python module is able to load data from many different types of data format including both CPU and GPU data structures. For a complete list of supported data types, please reference the :ref:`py-data`. For a detailed description of text input formats, please visit :doc:`/tutorials/input_format`. -- NumPy 2D array -- SciPy 2D sparse array -- Pandas data frame -- cuDF DataFrame -- cupy 2D array -- dlpack -- datatable -- XGBoost binary buffer file. -- LIBSVM text format file -- Comma-separated values (CSV) file -- Arrow table. - -(See :doc:`/tutorials/input_format` for detailed description of text input format.) - -The data is stored in a :py:class:`DMatrix ` object. +The input data is stored in a :py:class:`DMatrix ` object. For the sklearn estimator interface, a :py:class:`DMatrix` or a :py:class:`QuantileDMatrix` is created depending on the chosen algorithm and the input, see the sklearn API reference for details. We will illustrate some of the basic input types with the ``DMatrix`` here. * To load a NumPy array into :py:class:`DMatrix `: @@ -120,6 +105,81 @@ to number of groups. 
recommended to use pandas ``read_csv`` or other similar utilites than XGBoost's builtin parser. +.. _py-data: + +Supported data structures for various XGBoost functions +======================================================= + +******* +Markers +******* + +- T: Supported. +- F: Not supported. +- NE: Invalid type for the use case. For instance, `pd.Series` cannot be a multi-target label. +- NPA: Support with the help of numpy array. +- CPA: Support with the help of cupy array. +- SciCSR: Support with the help of scipy sparse CSR. The conversion to scipy CSR may or may not be possible. Raise a type error if conversion fails. +- FF: We can look forward to having its support in the near future if requested. +- empty: To be filled in. + +************ +Table Header +************ +- `X` means predictor matrix. +- Meta info: label, weight, etc. +- Multi Label: 2-dim label for multi-target. +- Others: Anything else that we don't list here explicitly including formats like `lil`, `dia`, `bsr`. XGBoost will try to convert it into scipy csr. 
+ +************** +Support Matrix +************** + ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| Name | DMatrix X | QuantileDMatrix X | Sklearn X | Meta Info | Inplace prediction | Multi Label | ++=========================+===========+===================+===========+===========+====================+=============+ +| numpy.ndarray | T | T | T | T | T | T | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| scipy.sparse.csr | T | T | T | NE | T | F | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| scipy.sparse.csc | T | F | T | NE | F | F | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| scipy.sparse.coo | SciCSR | F | SciCSR | NE | F | F | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| uri | T | F | F | F | NE | F | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| list | NPA | NPA | NPA | NPA | NPA | T | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| tuple | NPA | NPA | NPA | NPA | NPA | T | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| pandas.DataFrame | NPA | NPA | NPA | NPA | NPA | NPA | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| pandas.Series | NPA | NPA | NPA | NPA | NPA | NE | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| cudf.DataFrame | T | T | T | T | T | T | 
++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| cudf.Series | T | T | T | T | FF | NE | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| cupy.ndarray | T | T | T | T | T | T | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| dlpack | CPA | CPA | | CPA | FF | FF | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| datatable.Frame | T | FF | | NPA | FF | | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| datatable.Table | T | FF | | NPA | FF | | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| modin.DataFrame | NPA | FF | NPA | NPA | FF | | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| modin.Series | NPA | FF | NPA | NPA | FF | | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| pyarrow.Table | T | F | | NPA | FF | | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| pyarrow.dataset.Dataset | T | F | | | F | | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| _\_array\_\_ | NPA | F | NPA | NPA | H | | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ +| Others | SciCSR | F | | F | F | | ++-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ Setting Parameters ------------------ diff --git 
a/python-package/xgboost/core.py b/python-package/xgboost/core.py index e63bc470c..b5bafe453 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -619,11 +619,11 @@ class DataSplitMode(IntEnum): class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-methods """Data Matrix used in XGBoost. - DMatrix is an internal data structure that is used by XGBoost, - which is optimized for both memory efficiency and training speed. - You can construct DMatrix from multiple different sources of data. - """ + DMatrix is an internal data structure that is used by XGBoost, which is optimized + for both memory efficiency and training speed. You can construct DMatrix from + multiple different sources of data. + """ @_deprecate_positional_args def __init__( self, @@ -647,15 +647,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m ) -> None: """Parameters ---------- - data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/ - dt.Frame/cudf.DataFrame/cupy.array/dlpack/arrow.Table - - Data source of DMatrix. - - When data is string or os.PathLike type, it represents the path libsvm - format txt file, csv file (by specifying uri parameter - 'path_to_csv?format=csv'), or binary file that xgboost can read from. - + data : + Data source of DMatrix. See :ref:`py-data` for a list of supported input + types. label : array_like Label of the training data. weight : array_like diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index ecd3957d1..5ca9770a4 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -939,7 +939,14 @@ class XGBModel(XGBModelBase): Parameters ---------- X : - Feature matrix + Feature matrix. See :ref:`py-data` for a list of supported types. 
+ + When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the + :py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix` + for conserving memory. However, this has performance implications when the + device of input data is not matched with algorithm. For instance, if the + input is a numpy array on CPU but ``gpu_hist`` is used for training, then + the data is first processed on CPU then transferred to GPU. y : Labels sample_weight : @@ -982,6 +989,7 @@ class XGBModel(XGBModelBase): callbacks : .. deprecated:: 1.6.0 Use `callbacks` in :py:meth:`__init__` or :py:meth:`set_params` instead. + """ with config_context(verbosity=self.verbosity): evals_result: TrainingCallback.EvalsLog = {} @@ -1567,7 +1575,7 @@ class XGBClassifier(XGBModel, XGBClassifierBase): Parameters ---------- X : array_like - Feature matrix. + Feature matrix. See :ref:`py-data` for a list of supported types. ntree_limit : int Deprecated, use `iteration_range` instead. validate_features : bool @@ -1846,7 +1854,14 @@ class XGBRanker(XGBModel, XGBRankerMixIn): Parameters ---------- X : - Feature matrix + Feature matrix. See :ref:`py-data` for a list of supported types. + + When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the + :py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix` + for conserving memory. However, this has performance implications when the + device of input data is not matched with algorithm. For instance, if the + input is a numpy array on CPU but ``gpu_hist`` is used for training, then + the data is first processed on CPU then transferred to GPU. y : Labels group : @@ -1917,6 +1932,7 @@ class XGBRanker(XGBModel, XGBRankerMixIn): callbacks : .. deprecated:: 1.6.0 Use `callbacks` in :py:meth:`__init__` or :py:meth:`set_params` instead. 
+ """ # check if group information is provided with config_context(verbosity=self.verbosity): diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index ae62fe3e4..f7f4e1f32 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1084,6 +1084,12 @@ def test_pandas_input(): ) np.testing.assert_allclose(np.array(clf_isotonic.classes_), np.array([0, 1])) + train_ser = train["k1"] + assert isinstance(train_ser, pd.Series) + model = xgb.XGBClassifier(n_estimators=8) + model.fit(train_ser, target, eval_set=[(train_ser, target)]) + assert tm.non_increasing(model.evals_result()["validation_0"]["logloss"]) + @pytest.mark.parametrize("tree_method", ["approx", "hist"]) def test_feature_weights(tree_method): @@ -1239,6 +1245,10 @@ def test_multilabel_classification() -> None: np.testing.assert_allclose(clf.predict(X), predt) assert predt.dtype == np.int64 + y = y.tolist() + clf.fit(X, y) + np.testing.assert_allclose(clf.predict(X), predt) + def test_data_initialization(): from sklearn.datasets import load_digits