Thread safe, inplace prediction. (#5389)

Normal prediction with DMatrix is now made thread safe by locking; the newly added inplace prediction is lock-free and thread safe.

When data is on device (cupy, cudf), the returned data is also on device.

* Implementation for numpy, csr, cudf and cupy.

* Implementation for dask.

* Remove sync in simple dmatrix.
This commit is contained in:
Jiaming Yuan
2020-03-30 15:35:28 +08:00
committed by GitHub
parent 7f980e9f83
commit 6601a641d7
25 changed files with 1217 additions and 167 deletions

View File

@@ -177,9 +177,8 @@ void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
}
}
void RandomDataGenerator::GenerateArrayInterface(
HostDeviceVector<float> *storage, std::string *out) const {
CHECK(out);
Json RandomDataGenerator::ArrayInterfaceImpl(HostDeviceVector<float> *storage,
size_t rows, size_t cols) const {
this->GenerateDense(storage);
Json array_interface {Object()};
array_interface["data"] = std::vector<Json>(2);
@@ -187,13 +186,37 @@ void RandomDataGenerator::GenerateArrayInterface(
array_interface["data"][1] = Boolean(false);
array_interface["shape"] = std::vector<Json>(2);
array_interface["shape"][0] = rows_;
array_interface["shape"][1] = cols_;
array_interface["shape"][0] = rows;
array_interface["shape"][1] = cols;
array_interface["typestr"] = String("<f4");
array_interface["version"] = 1;
return array_interface;
}
Json::Dump(array_interface, out);
// Serialize the generated dense matrix (rows_ x cols_) into a JSON
// array-interface string; the values themselves are written to *storage.
std::string RandomDataGenerator::GenerateArrayInterface(
    HostDeviceVector<float> *storage) const {
  std::string result;
  Json::Dump(this->ArrayInterfaceImpl(storage, rows_, cols_), &result);
  return result;
}
// Build a columnar (cuDF-style) interface: a JSON array holding one
// single-column array interface per feature.  *data must already hold
// cols_ storage vectors, one per column.
std::string RandomDataGenerator::GenerateColumnarArrayInterface(
    std::vector<HostDeviceVector<float>> *data) const {
  CHECK(data);
  CHECK_EQ(data->size(), cols_);
  auto &columns = *data;
  Json interfaces { Array() };
  for (size_t c = 0; c < cols_; ++c) {
    auto column_interface = this->ArrayInterfaceImpl(&columns[c], rows_, 1);
    get<Array>(interfaces).emplace_back(column_interface);
  }
  std::string result;
  Json::Dump(interfaces, &result);
  return result;
}
void RandomDataGenerator::GenerateCSR(

View File

@@ -181,6 +181,9 @@ class RandomDataGenerator {
int32_t device_;
int32_t seed_;
Json ArrayInterfaceImpl(HostDeviceVector<float> *storage, size_t rows,
size_t cols) const;
public:
RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity)
: rows_{rows}, cols_{cols}, sparsity_{sparsity}, lower_{0.0f}, upper_{1.0f},
@@ -204,7 +207,9 @@ class RandomDataGenerator {
}
void GenerateDense(HostDeviceVector<float>* out) const;
void GenerateArrayInterface(HostDeviceVector<float>* storage, std::string* out) const;
std::string GenerateArrayInterface(HostDeviceVector<float>* storage) const;
std::string GenerateColumnarArrayInterface(
std::vector<HostDeviceVector<float>> *data) const;
void GenerateCSR(HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
HostDeviceVector<bst_feature_t>* columns) const;

View File

@@ -6,7 +6,9 @@
#include <xgboost/predictor.h>
#include "../helpers.h"
#include "test_predictor.h"
#include "../../../src/gbm/gbtree_model.h"
#include "../../../src/data/adapter.h"
namespace xgboost {
TEST(CpuPredictor, Basic) {
@@ -138,4 +140,27 @@ TEST(CpuPredictor, ExternalMemory) {
}
}
}
// Exercises inplace prediction on the CPU predictor for both dense and
// CSR inputs.  The actual comparison against DMatrix-based prediction is
// done by the shared TestInplacePrediction helper.
TEST(CpuPredictor, InplacePredict) {
  bst_row_t constexpr kRows{128};
  bst_feature_t constexpr kCols{64};
  // Device(-1): keep the generated data on the host (CPU) side.
  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(-1);
  {
    // Dense input wrapped in a DenseAdapter.
    HostDeviceVector<float> data;
    gen.GenerateDense(&data);
    ASSERT_EQ(data.Size(), kRows * kCols);
    data::DenseAdapter x{data.HostPointer(), kRows, kCols};
    TestInplacePrediction(x, "cpu_predictor", kRows, kCols, -1);
  }
  {
    // Sparse input wrapped in a CSRAdapter.
    HostDeviceVector<float> data;
    HostDeviceVector<bst_row_t> rptrs;
    HostDeviceVector<bst_feature_t> columns;
    gen.GenerateCSR(&data, &rptrs, &columns);
    data::CSRAdapter x(rptrs.HostPointer(), columns.HostPointer(),
                       data.HostPointer(), kRows, data.Size(), kCols);
    TestInplacePrediction(x, "cpu_predictor", kRows, kCols, -1);
  }
}
} // namespace xgboost

View File

@@ -1,16 +1,17 @@
/*!
* Copyright 2017-2020 XGBoost contributors
*/
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <xgboost/c_api.h>
#include <xgboost/predictor.h>
#include <xgboost/logging.h>
#include <xgboost/learner.h>
#include <string>
#include "gtest/gtest.h"
#include "../helpers.h"
#include "../../../src/gbm/gbtree_model.h"
#include "../../../src/data/device_adapter.cuh"
#include "test_predictor.h"
namespace xgboost {
@@ -104,5 +105,43 @@ TEST(GPUPredictor, ExternalMemoryTest) {
}
}
}
// Inplace prediction on device 0 via a cupy-style array-interface string.
TEST(GPUPredictor, InplacePredictCupy) {
  constexpr size_t kRows = 128;
  constexpr size_t kCols = 64;
  RandomDataGenerator generator(kRows, kCols, 0.5);
  generator.Device(0);
  HostDeviceVector<float> storage;
  auto const array_str = generator.GenerateArrayInterface(&storage);
  data::CupyAdapter adapter{array_str};
  TestInplacePrediction(adapter, "gpu_predictor", kRows, kCols, 0);
}
// Inplace prediction on device 0 via a columnar (cuDF-style) interface.
TEST(GPUPredictor, InplacePredictCuDF) {
  constexpr size_t kRows = 128;
  constexpr size_t kCols = 64;
  RandomDataGenerator generator(kRows, kCols, 0.5);
  generator.Device(0);
  std::vector<HostDeviceVector<float>> columns(kCols);
  auto const columnar_str = generator.GenerateColumnarArrayInterface(&columns);
  data::CudfAdapter adapter{columnar_str};
  TestInplacePrediction(adapter, "gpu_predictor", kRows, kCols, 0);
}
// Multi-GPU check: data generated on device 1 predicts successfully when
// the learner also targets device 1, while targeting device 0 with
// device-1 data must raise dmlc::Error.
TEST(GPUPredictor, MGPU_InplacePredict) {
  int32_t n_gpus = xgboost::common::AllVisibleGPUs();
  if (n_gpus <= 1) {
    LOG(WARNING) << "GPUPredictor.MGPU_InplacePredict is skipped.";
    return;
  }
  size_t constexpr kRows{128}, kCols{64};
  RandomDataGenerator gen(kRows, kCols, 0.5);
  // Allocate the generated data on the second device.
  gen.Device(1);
  HostDeviceVector<float> data;
  std::string interface_str = gen.GenerateArrayInterface(&data);
  data::CupyAdapter x{interface_str};
  TestInplacePrediction(x, "gpu_predictor", kRows, kCols, 1);
  // Mismatch between the data's device (1) and the learner's (0).
  EXPECT_THROW(TestInplacePrediction(x, "gpu_predictor", kRows, kCols, 0),
               dmlc::Error);
}
} // namespace predictor
} // namespace xgboost

View File

@@ -77,4 +77,59 @@ void TestTrainingPrediction(size_t rows, std::string tree_method) {
predictions_0.ConstHostVector()[i], kRtEps);
}
}
// Shared driver for inplace-prediction tests.  Trains a small 4-class
// model for 4 iterations, then verifies that margin predictions over the
// iteration segments [0, 2) and [2, 4) sum to the prediction over the
// full range [0, 4) once the duplicated global bias is removed.
//
// x:         data adapter (dense/CSR/cupy/cudf) type-erased in dmlc::any.
// predictor: "cpu_predictor" or "gpu_predictor".
// device:    device ordinal passed as gpu_id; -1 selects CPU.
void TestInplacePrediction(dmlc::any x, std::string predictor,
                           bst_row_t rows, bst_feature_t cols,
                           int32_t device) {
  size_t constexpr kClasses { 4 };
  auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(device);
  std::shared_ptr<DMatrix> m = gen.GenerateDMatix(true, false, kClasses);

  std::unique_ptr<Learner> learner {
    Learner::Create({m})
  };

  learner->SetParam("num_parallel_tree", "4");
  learner->SetParam("num_class", std::to_string(kClasses));
  learner->SetParam("seed", "0");
  learner->SetParam("subsample", "0.5");
  learner->SetParam("gpu_id", std::to_string(device));
  learner->SetParam("predictor", predictor);
  for (int32_t it = 0; it < 4; ++it) {
    learner->UpdateOneIter(it, m);
  }

  // Margin over the first two iterations only.
  HostDeviceVector<float> *p_out_predictions_0{nullptr};
  learner->InplacePredict(x, "margin", std::numeric_limits<float>::quiet_NaN(),
                          &p_out_predictions_0, 0, 2);
  CHECK(p_out_predictions_0);
  HostDeviceVector<float> predict_0 (p_out_predictions_0->Size());
  predict_0.Copy(*p_out_predictions_0);

  // Margin over the last two iterations only.
  HostDeviceVector<float> *p_out_predictions_1{nullptr};
  learner->InplacePredict(x, "margin", std::numeric_limits<float>::quiet_NaN(),
                          &p_out_predictions_1, 2, 4);
  CHECK(p_out_predictions_1);
  HostDeviceVector<float> predict_1 (p_out_predictions_1->Size());
  predict_1.Copy(*p_out_predictions_1);

  // Margin over all four iterations.
  HostDeviceVector<float>* p_out_predictions{nullptr};
  learner->InplacePredict(x, "margin", std::numeric_limits<float>::quiet_NaN(),
                          &p_out_predictions, 0, 4);

  auto& h_pred = p_out_predictions->HostVector();
  auto& h_pred_0 = predict_0.HostVector();
  auto& h_pred_1 = predict_1.HostVector();

  ASSERT_EQ(h_pred.size(), rows * kClasses);
  ASSERT_EQ(h_pred.size(), h_pred_0.size());
  ASSERT_EQ(h_pred.size(), h_pred_1.size());
  for (size_t i = 0; i < h_pred.size(); ++i) {
    // Need to remove the global bias here: each partial margin carries it
    // once (0.5f — presumably the learner's default base_score; confirm
    // against learner defaults).
    ASSERT_NEAR(h_pred[i], h_pred_0[i] + h_pred_1[i] - 0.5f, kRtEps);
  }

  // Reset so the learner is not left pinned to a GPU for later use.
  learner->SetParam("gpu_id", "-1");
  learner->Configure();
}
} // namespace xgboost

View File

@@ -58,6 +58,9 @@ void TestPredictionFromGradientIndex(std::string name, size_t rows, int32_t bins
void TestTrainingPrediction(size_t rows, std::string tree_method);
void TestInplacePrediction(dmlc::any x, std::string predictor,
bst_row_t rows, bst_feature_t cols,
int32_t device = -1);
} // namespace xgboost
#endif // XGBOOST_TEST_PREDICTOR_H_

View File

@@ -1,8 +1,12 @@
from __future__ import print_function
import sys
import unittest
import pytest
import numpy as np
import unittest
import xgboost as xgb
sys.path.append("tests/python")
import testing as tm
from test_predict import run_threaded_predict # noqa
rng = np.random.RandomState(1994)
@@ -111,3 +115,65 @@ class TestGPUPredict(unittest.TestCase):
assert np.allclose(cpu_train_score, gpu_train_score)
assert np.allclose(cpu_test_score, gpu_test_score)
    @pytest.mark.skipif(**tm.no_cupy())
    def test_inplace_predict_cupy(self):
        # Inplace prediction from a cupy array must match prediction
        # through a DMatrix, both serially and under concurrent calls.
        import cupy as cp
        rows = 1000
        cols = 10
        cp_rng = cp.random.RandomState(1994)
        cp.random.set_random_state(cp_rng)
        X = cp.random.randn(rows, cols)
        y = cp.random.randn(rows)
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain, num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_dense(x):
            # Compare lock-free inplace prediction against a copied,
            # DMatrix-based prediction over the same slice.
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Repeat to increase the chance of exposing thread-safety issues.
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)
    @pytest.mark.skipif(**tm.no_cudf())
    def test_inplace_predict_cudf(self):
        # Inplace prediction from a cuDF dataframe must match prediction
        # through a DMatrix, both serially and under concurrent calls.
        import cupy as cp
        import cudf
        import pandas as pd
        rows = 1000
        cols = 10
        # NOTE: shadows the module-level ``rng`` on purpose-unclear
        # grounds; kept local to this test.
        rng = np.random.RandomState(1994)
        X = rng.randn(rows, cols)
        X = pd.DataFrame(X)
        y = rng.randn(rows)

        X = cudf.from_pandas(X)
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'gpu_hist'},
                            dtrain, num_boost_round=10)
        test = xgb.DMatrix(X)
        predt_from_array = booster.inplace_predict(X)
        predt_from_dmatrix = booster.predict(test)

        cp.testing.assert_allclose(predt_from_array, predt_from_dmatrix)

        def predict_df(x):
            # Compare inplace prediction with DMatrix-based prediction
            # over the same dataframe slice.
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = cp.array(booster.predict(d))
            return cp.all(copied_predt == inplace_predt)

        # Repeat to increase the chance of exposing thread-safety issues.
        for i in range(10):
            run_threaded_predict(X, rows, predict_df)

View File

@@ -2,6 +2,7 @@ import sys
import pytest
import numpy as np
import unittest
import xgboost
if sys.platform.startswith("win"):
pytest.skip("Skipping dask tests on Windows", allow_module_level=True)
@@ -29,6 +30,7 @@ class TestDistributedGPU(unittest.TestCase):
def test_dask_dataframe(self):
with LocalCUDACluster() as cluster:
with Client(cluster) as client:
import cupy
X, y = generate_array()
X = dd.from_dask_array(X)
@@ -49,6 +51,42 @@ class TestDistributedGPU(unittest.TestCase):
predictions = dxgb.predict(client, out, dtrain).compute()
assert isinstance(predictions, np.ndarray)
# There's an error with cudf saying `concat_cudf` got an
# unexpected argument `ignore_index`.  So the test here is just a
# placeholder.
# series_predictions = dxgb.inplace_predict(client, out, X)
# assert isinstance(series_predictions, dd.Series)
single_node = out['booster'].predict(
xgboost.DMatrix(X.compute()))
cupy.testing.assert_allclose(single_node, predictions)
    @pytest.mark.skipif(**tm.no_cupy())
    def test_dask_array(self):
        # Train on dask arrays backed by cupy blocks and check that
        # DMatrix prediction, inplace prediction, and single-node
        # prediction all agree.
        with LocalCUDACluster() as cluster:
            with Client(cluster) as client:
                import cupy
                X, y = generate_array()

                # Move each dask block onto the GPU.
                X = X.map_blocks(cupy.asarray)
                y = y.map_blocks(cupy.asarray)
                dtrain = dxgb.DaskDMatrix(client, X, y)
                out = dxgb.train(client, {'tree_method': 'gpu_hist'},
                                 dtrain=dtrain,
                                 evals=[(dtrain, 'X')],
                                 num_boost_round=2)
                from_dmatrix = dxgb.predict(client, out, dtrain).compute()
                inplace_predictions = dxgb.inplace_predict(
                    client, out, X).compute()
                # Reference: predict on a single node with the full data.
                single_node = out['booster'].predict(
                    xgboost.DMatrix(X.compute()))
                np.testing.assert_allclose(single_node, from_dmatrix)
                cupy.testing.assert_allclose(
                    cupy.array(single_node),
                    inplace_predictions)
@pytest.mark.skipif(**tm.no_dask())
@pytest.mark.skipif(**tm.no_dask_cuda())
@pytest.mark.mgpu

View File

@@ -0,0 +1,63 @@
'''Tests for running inplace prediction.'''
import unittest
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from scipy import sparse
import xgboost as xgb
def run_threaded_predict(X, rows, predict_func):
    '''Run ``predict_func`` concurrently over 20-row slices of ``X``.

    ``X`` is sliced with ``[start:start + 20]`` (falling back to ``.iloc``
    for dataframes); the stride between slice starts is ``rows // 20``, so
    slices may overlap or leave gaps when that stride is not 20.  Each
    slice is submitted to a 10-worker thread pool and every returned
    result is asserted truthy.
    '''
    per_thread = 20
    stride = int(rows / per_thread)
    futures = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        for start in range(0, rows, stride):
            try:
                chunk = X[start:start + per_thread, ...]
            except TypeError:
                # Dataframe-like inputs need positional .iloc indexing.
                chunk = X.iloc[start:start + per_thread, ...]
            futures.append(executor.submit(predict_func, chunk))
    for fut in futures:
        assert fut.result()
class TestInplacePredict(unittest.TestCase):
    '''Tests for running inplace prediction'''
    def test_predict(self):
        # Train a small hist model, then verify that inplace prediction
        # agrees with DMatrix-based prediction for dense and CSR inputs,
        # both serially and from multiple threads.
        rows = 1000
        cols = 10

        np.random.seed(1994)

        X = np.random.randn(rows, cols)
        y = np.random.randn(rows)
        dtrain = xgb.DMatrix(X, y)

        booster = xgb.train({'tree_method': 'hist'},
                            dtrain, num_boost_round=10)
        test = xgb.DMatrix(X[:10, ...])
        predt_from_array = booster.inplace_predict(X[:10, ...])
        predt_from_dmatrix = booster.predict(test)

        np.testing.assert_allclose(predt_from_dmatrix, predt_from_array)

        def predict_dense(x):
            # Dense path: both predictions must be exactly equal.
            inplace_predt = booster.inplace_predict(x)
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        # Repeat to increase the chance of exposing thread-safety issues.
        for i in range(10):
            run_threaded_predict(X, rows, predict_dense)

        def predict_csr(x):
            # Sparse path: feed the same slice converted to CSR.
            inplace_predt = booster.inplace_predict(sparse.csr_matrix(x))
            d = xgb.DMatrix(x)
            copied_predt = booster.predict(d)
            return np.all(copied_predt == inplace_predt)

        for i in range(10):
            run_threaded_predict(X, rows, predict_csr)

View File

@@ -63,8 +63,14 @@ def test_from_dask_dataframe():
from_df = prediction.compute()
assert isinstance(prediction, dd.Series)
assert np.all(prediction.compute().values == from_dmatrix)
assert np.all(from_dmatrix == from_df.to_numpy())
series_predictions = xgb.dask.inplace_predict(client, booster, X)
assert isinstance(series_predictions, dd.Series)
np.testing.assert_allclose(series_predictions.compute().values,
from_dmatrix)
def test_from_dask_array():
with LocalCluster(n_workers=5, threads_per_worker=5) as cluster: