Support for all primitive types from array. (#7003)

* Change C API name.
* Test for all primitive types from array.
* Add native support for 128-bit float on CPU.
* Convert boolean and float16 in Python.
* Pin dask version for now.
2021-06-01 08:34:48 +08:00 · 2021-06-01 08:34:48 +08:00 · ee4f51a631
commit ee4f51a631
parent 816b789bf0
8 changed files with 154 additions and 24 deletions
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@ -142,7 +142,7 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
 * \param out created dmatrix
 * \return 0 when success, -1 when failure happens
 */
-XGB_DLL int XGDMatrixCreateFromArray(char const *data,
+XGB_DLL int XGDMatrixCreateFromDense(char const *data,
                                     char const *json_config,
                                     DMatrixHandle *out);
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@ -239,7 +239,18 @@ def _array_interface(data: np.ndarray) -> bytes:
    interface = data.__array_interface__
    if "mask" in interface:
        interface["mask"] = interface["mask"].__array_interface__
-    interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
+    interface_str = bytes(json.dumps(interface), "utf-8")
    return interface_str
 def _cuda_array_interface(data) -> bytes:
    assert (
        data.dtype.hasobject is False
    ), "Input data contains `object` dtype.  Expecting numeric data."
    interface = data.__cuda_array_interface__
    if "mask" in interface:
        interface["mask"] = interface["mask"].__cuda_array_interface__
    interface_str = bytes(json.dumps(interface), "utf-8")
    return interface_str
@ -1948,10 +1959,7 @@ class Booster(object):
            from .data import _transform_cupy_array
            data = _transform_cupy_array(data)
-            interface = data.__cuda_array_interface__
+            interface_str = _cuda_array_interface(data)
            if "mask" in interface:
                interface["mask"] = interface["mask"].__cuda_array_interface__
            interface_str = bytes(json.dumps(interface, indent=2), "utf-8")
            _check_call(
                _LIB.XGBoosterPredictFromCudaArray(
                    self.handle,
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@ -9,7 +9,8 @@ from typing import Any
 import numpy as np
-from .core import c_array, _LIB, _check_call, c_str, _array_interface
+from .core import c_array, _LIB, _check_call, c_str
 from .core import _array_interface, _cuda_array_interface
 from .core import DataIter, _ProxyDMatrix, DMatrix
 from .compat import lazy_isinstance
@ -105,7 +106,7 @@ def _is_numpy_array(data):
 def _ensure_np_dtype(data, dtype):
-    if data.dtype.hasobject:
+    if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]:
        data = data.astype(np.float32, copy=False)
        dtype = np.float32
    return data, dtype
@ -141,7 +142,7 @@ def _from_numpy_array(data, missing, nthread, feature_names, feature_types):
    }
    config = bytes(json.dumps(args), "utf-8")
    _check_call(
-        _LIB.XGDMatrixCreateFromArray(
+        _LIB.XGDMatrixCreateFromDense(
            _array_interface(data),
            config,
            ctypes.byref(handle),
@ -416,21 +417,19 @@ def _is_cupy_array(data):
 def _transform_cupy_array(data):
    """Return ``data`` as a cupy array with a dtype suitable for passing on.

    An input that lacks ``__cuda_array_interface__`` but provides
    ``__array__`` is wrapped with ``cupy.array(..., copy=False)``.  Arrays
    whose dtype contains objects, or is ``float16`` or ``bool_``, are cast to
    ``float32``; every other input is returned unchanged.
    """
    # cupy is optional, so import it lazily -- and only once; the second
    # import that used to sit inside the branch below was redundant.
    import cupy  # pylint: disable=import-error
    if not hasattr(data, '__cuda_array_interface__') and hasattr(
            data, '__array__'):
        data = cupy.array(data, copy=False)
    if data.dtype.hasobject or data.dtype in [cupy.float16, cupy.bool_]:
        data = data.astype(cupy.float32, copy=False)
    return data
 def _from_cupy_array(data, missing, nthread, feature_names, feature_types):
    """Initialize DMatrix from cupy ndarray."""
    data = _transform_cupy_array(data)
-    interface = data.__cuda_array_interface__
+    interface_str = _cuda_array_interface(data)
    if 'mask' in interface:
        interface['mask'] = interface['mask'].__cuda_array_interface__
    interface_str = bytes(json.dumps(interface, indent=2), 'utf-8')
    handle = ctypes.c_void_p()
    _check_call(
        _LIB.XGDMatrixCreateFromArrayInterface(
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@ -261,7 +261,7 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr,
  API_END();
 }
-XGB_DLL int XGDMatrixCreateFromArray(char const *data,
+XGB_DLL int XGDMatrixCreateFromDense(char const *data,
                                     char const *c_json_config,
                                     DMatrixHandle *out) {
  API_BEGIN();
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@ -42,7 +42,8 @@ struct ArrayInterfaceErrors {
    return str.c_str();
  }
  static char const* Version() {
-    return "Only version <= 3 of `__cuda_array_interface__' are supported.";
+    return "Only version <= 3 of "
           "`__cuda_array_interface__/__array_interface__' are supported.";
  }
  static char const* OfType(std::string const& type) {
    static std::string str;
@ -81,7 +82,7 @@ struct ArrayInterfaceErrors {
        return "Other";
      default:
        LOG(FATAL) << "Invalid type code: " << c << " in `typestr' of input array."
-                   << "\nPlease verify the `__cuda_array_interface__' "
+                   << "\nPlease verify the `__cuda_array_interface__/__array_interface__' "
                   << "of your input data complies to: "
                   << "https://docs.scipy.org/doc/numpy/reference/arrays.interface.html"
                   << "\nOr open an issue.";
@ -90,7 +91,7 @@ struct ArrayInterfaceErrors {
  }
  static std::string UnSupportedType(StringView typestr) {
-    return TypeStr(typestr[1]) + " is not supported.";
+    return TypeStr(typestr[1]) + "-" + typestr[2] + " is not supported.";
  }
 };
@ -135,8 +136,9 @@ class ArrayInterfaceHandler {
    if (array.find("typestr") == array.cend()) {
      LOG(FATAL) << "Missing `typestr' field for array interface";
    }
    auto typestr = get<String const>(array.at("typestr"));
-    CHECK_EQ(typestr.size(),    3) << ArrayInterfaceErrors::TypestrFormat();
+    CHECK(typestr.size() == 3 || typestr.size() == 4) << ArrayInterfaceErrors::TypestrFormat();
    CHECK_NE(typestr.front(), '>') << ArrayInterfaceErrors::BigEndian();
    if (array.find("shape") == array.cend()) {
@ -295,7 +297,7 @@ class ArrayInterface {
  }
 public:
-  enum Type : std::int8_t { kF4, kF8, kI1, kI2, kI4, kI8, kU1, kU2, kU4, kU8 };
+  enum Type : std::int8_t { kF4, kF8, kF16, kI1, kI2, kI4, kI8, kU1, kU2, kU4, kU8 };
 public:
  ArrayInterface() = default;
@ -331,7 +333,12 @@ class ArrayInterface {
  }
  void AssignType(StringView typestr) {
-    if (typestr[1] == 'f' && typestr[2] == '4') {
+    if (typestr.size() == 4 && typestr[1] == 'f' && typestr[2] == '1' &&
        typestr[3] == '6') {
      type = kF16;
      CHECK(sizeof(long double) == 16)
          << "128-bit floating point is not supported on current platform.";
    } else if (typestr[1] == 'f' && typestr[2] == '4') {
      type = kF4;
    } else if (typestr[1] == 'f' && typestr[2] == '8') {
      type = kF8;
@ -364,6 +371,16 @@ class ArrayInterface {
      return func(reinterpret_cast<float *>(data));
    case kF8:
      return func(reinterpret_cast<double *>(data));
 #ifdef __CUDA_ARCH__
    case kF16: {
      // CUDA device code doesn't support long double.
      SPAN_CHECK(false);
      return func(reinterpret_cast<double *>(data));
    }
 #else
    case kF16:
      return func(reinterpret_cast<long double *>(data));
 #endif
    case kI1:
      return func(reinterpret_cast<int8_t *>(data));
    case kI2:
--- a/tests/ci_build/conda_env/macos_cpu_test.yml
+++ b/tests/ci_build/conda_env/macos_cpu_test.yml
@ -13,8 +13,8 @@ dependencies:
 - scikit-learn
 - pandas
 - matplotlib
- dask
+- dask=2021.05.0
- distributed
+- distributed=2021.05.0
 - graphviz
 - python-graphviz
 - hypothesis
--- a/tests/python-gpu/test_gpu_prediction.py
+++ b/tests/python-gpu/test_gpu_prediction.py
@ -204,6 +204,7 @@ class TestGPUPredict:
        cpu_predt = reg.predict(X)
        np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)
    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        import cupy as cp
@ -332,6 +333,7 @@ class TestGPUPredict:
        rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False)
        np.testing.assert_almost_equal(rmse, eval_history['train']['rmse'][-1], decimal=5)
    @pytest.mark.skipif(**tm.no_cupy())
    @pytest.mark.parametrize("n_classes", [2, 3])
    def test_predict_dart(self, n_classes):
        from sklearn.datasets import make_classification
@ -378,3 +380,59 @@ class TestGPUPredict:
        copied = cp.array(copied)
        cp.testing.assert_allclose(inplace, copied, atol=1e-6)
    @pytest.mark.skipif(**tm.no_cupy())
    def test_dtypes(self):
        """Check in-place prediction on cupy inputs of many dtypes.

        Integer, float and boolean copies of the same data must predict the
        same values as the original array; complex inputs must raise
        ``ValueError``.
        """
        import cupy as cp
        rows = 1000
        cols = 10
        rng = cp.random.RandomState(1994)
        orig = rng.randint(low=0, high=127, size=rows * cols).reshape(
            rows, cols
        )
        y = rng.randint(low=0, high=127, size=rows)
        dtrain = xgb.DMatrix(orig, label=y)
        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain)
        predt_orig = booster.inplace_predict(orig)
        # Concrete numeric dtypes only.  The abstract scalar classes
        # (signedinteger, unsignedinteger, floating) are not real dtypes and
        # NumPy deprecates converting them to one, so fixed-width types are
        # listed instead.
        # NOTE(review): assumes cupy shares these scalar classes with NumPy
        # -- confirm against the cupy version in use.
        for dtype in [
            cp.int32,
            cp.int64,
            cp.byte,
            cp.short,
            cp.intc,
            cp.int_,
            cp.longlong,
            cp.uint32,
            cp.uint64,
            cp.ubyte,
            cp.ushort,
            cp.uintc,
            cp.uint,
            cp.ulonglong,
            cp.float16,
            cp.float32,
            cp.float64,
            cp.half,
            cp.single,
            cp.double,
        ]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)
        # boolean
        orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(
            rows, cols
        )
        predt_orig = booster.inplace_predict(orig)
        # The legacy ``bool8`` alias duplicated ``bool_``; exercise the
        # builtin ``bool`` spelling instead.
        for dtype in [cp.bool_, bool]:
            X = cp.array(orig, dtype=dtype)
            predt = booster.inplace_predict(X)
            cp.testing.assert_allclose(predt, predt_orig)
        # unsupported types
        for dtype in [
            cp.complex64,
            cp.complex128,
        ]:
            X = cp.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                booster.inplace_predict(X)
--- a/tests/python/test_predict.py
+++ b/tests/python/test_predict.py
@ -237,3 +237,51 @@ class TestInplacePredict:
        dtrain = xgb.DMatrix(self.X, self.y, base_margin=base_margin)
        from_dmatrix = booster.predict(dtrain)
        np.testing.assert_allclose(from_dmatrix, from_inplace)
    def test_dtypes(self):
        """Check in-place prediction on numpy inputs of many dtypes.

        Integer, float and boolean copies of the same data must predict the
        same values as the original array; bytes and complex inputs must
        raise ``ValueError``.
        """
        orig = self.rng.randint(low=0, high=127, size=self.rows * self.cols).reshape(
            self.rows, self.cols
        )
        predt_orig = self.booster.inplace_predict(orig)
        # Concrete numeric dtypes only.  The abstract scalar classes
        # (np.signedinteger, np.unsignedinteger, np.floating) are not real
        # dtypes and NumPy deprecates converting them to one, so fixed-width
        # types are listed instead.
        for dtype in [
            np.int32,
            np.int64,
            np.byte,
            np.short,
            np.intc,
            np.int_,
            np.longlong,
            np.uint32,
            np.uint64,
            np.ubyte,
            np.ushort,
            np.uintc,
            np.uint,
            np.ulonglong,
            np.float16,
            np.float32,
            np.float64,
            np.half,
            np.single,
            np.double,
        ]:
            X = np.array(orig, dtype=dtype)
            predt = self.booster.inplace_predict(X)
            np.testing.assert_allclose(predt, predt_orig)
        # boolean
        orig = self.rng.binomial(1, 0.5, size=self.rows * self.cols).reshape(
            self.rows, self.cols
        )
        predt_orig = self.booster.inplace_predict(orig)
        # ``np.bool8`` was only an alias of ``np.bool_`` and is removed in
        # NumPy 2.0; exercise the builtin ``bool`` spelling instead.
        for dtype in [np.bool_, bool]:
            X = np.array(orig, dtype=dtype)
            predt = self.booster.inplace_predict(X)
            np.testing.assert_allclose(predt, predt_orig)
        # unsupported types
        for dtype in [
            # ``np.bytes_`` is the same type as the removed ``np.string_``.
            np.bytes_,
            np.complex64,
            np.complex128,
        ]:
            X = np.array(orig, dtype=dtype)
            with pytest.raises(ValueError):
                self.booster.inplace_predict(X)