Multi-threaded XGDMatrixCreateFromMat for faster DMatrix creation (#2530)

* Multi-threaded XGDMatrixCreateFromMat for faster DMatrix creation from numpy arrays for python interface.
2017-07-20 19:43:17 -07:00
parent 56550ff3f1
commit 6b375f6ad8
9 changed files with 324 additions and 73 deletions
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -290,6 +290,7 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t* col_ptr,
  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());

  API_BEGIN();
+  // FIXME: User should be able to control number of threads
  const int nthread = omp_get_max_threads();
  data::SimpleCSRSource& mat = *source;
  common::ParallelGroupBuilder<RowBatch::Entry> builder(&mat.row_ptr_, &mat.row_data_);
@@ -350,24 +351,159 @@ XGB_DLL int XGDMatrixCreateFromMat(const bst_float* data,

  API_BEGIN();
  data::SimpleCSRSource& mat = *source;
+  mat.row_ptr_.resize(1+nrow);
  bool nan_missing = common::CheckNAN(missing);
  mat.info.num_row = nrow;
  mat.info.num_col = ncol;
+  const bst_float* data0 = data;
+
+  // count elements for sizing data
+  data = data0;
  for (xgboost::bst_ulong i = 0; i < nrow; ++i, data += ncol) {
    xgboost::bst_ulong nelem = 0;
    for (xgboost::bst_ulong j = 0; j < ncol; ++j) {
      if (common::CheckNAN(data[j])) {
        CHECK(nan_missing)
-            << "There are NAN in the matrix, however, you did not set missing=NAN";
+          << "There are NAN in the matrix, however, you did not set missing=NAN";
      } else {
        if (nan_missing || data[j] != missing) {
-          mat.row_data_.push_back(RowBatch::Entry(j, data[j]));
          ++nelem;
        }
      }
    }
-    mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem);
+    mat.row_ptr_[i+1] = mat.row_ptr_[i] + nelem;
  }
+  mat.row_data_.resize(mat.row_data_.size() + mat.row_ptr_.back());
+
+  data = data0;
+  for (xgboost::bst_ulong i = 0; i < nrow; ++i, data += ncol) {
+    xgboost::bst_ulong matj = 0;
+    for (xgboost::bst_ulong j = 0; j < ncol; ++j) {
+      if (common::CheckNAN(data[j])) {
+      } else {
+        if (nan_missing || data[j] != missing) {
+          mat.row_data_[mat.row_ptr_[i] + matj] = RowBatch::Entry(j, data[j]);
+          ++matj;
+        }
+      }
+    }
+  }
+
+  mat.info.num_nonzero = mat.row_data_.size();
+  *out  = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
+  API_END();
+}
+
+void prefixsum_inplace(size_t *x, size_t N) {
+  size_t *suma;
+#pragma omp parallel
+  {
+    const int ithread = omp_get_thread_num();
+    const int nthreads = omp_get_num_threads();
+#pragma omp single
+    {
+      suma = new size_t[nthreads+1];
+      suma[0] = 0;
+    }
+    size_t sum = 0;
+#pragma omp for schedule(static)
+    for (omp_ulong i = 0; i < N; i++) {
+      sum += x[i];
+      x[i] = sum;
+    }
+    suma[ithread+1] = sum;
+#pragma omp barrier
+    size_t offset = 0;
+    for (omp_ulong i = 0; i < (ithread+1); i++) {
+      offset += suma[i];
+    }
+#pragma omp for schedule(static)
+    for (omp_ulong i = 0; i < N; i++) {
+      x[i] += offset;
+    }
+  }
+  delete[] suma;
+}
+
+
+XGB_DLL int XGDMatrixCreateFromMat_omp(const bst_float* data,
+                                       xgboost::bst_ulong nrow,
+                                       xgboost::bst_ulong ncol,
+                                       bst_float missing,
+                                       DMatrixHandle* out,
+                                       int nthread) {
+  // avoid openmp unless enough data to be worth it to avoid overhead costs
+  if (nrow*ncol <= 10000*50) {
+    return(XGDMatrixCreateFromMat(data, nrow, ncol, missing, out));
+  }
+
+  API_BEGIN();
+  const int nthreadmax = std::max(omp_get_num_procs() / 2 - 1, 1);
+  //  const int nthreadmax = omp_get_max_threads();
+  if (nthread <= 0) nthread=nthreadmax;
+  omp_set_num_threads(nthread);
+  xgboost::bst_ulong nrow_reserve_per_thread = std::ceil(nrow/static_cast<double>(nthread));
+
+  std::unique_ptr<data::SimpleCSRSource> source(new data::SimpleCSRSource());
+  data::SimpleCSRSource& mat = *source;
+  mat.row_ptr_.resize(1+nrow);
+  mat.info.num_row = nrow;
+  mat.info.num_col = ncol;
+
+  // Check for errors in missing elements
+  // Count elements per row (to avoid otherwise need to copy)
+  bool nan_missing = common::CheckNAN(missing);
+  int *badnan;
+  badnan = new int[nthread];
+  for (int i = 0; i < nthread; i++) {
+    badnan[i] = 0;
+  }
+
+#pragma omp parallel num_threads(nthread)
+  {
+    int ithread  = omp_get_thread_num();
+
+    // Count elements per row
+#pragma omp for schedule(static)
+    for (omp_ulong i = 0; i < nrow; ++i) {
+      xgboost::bst_ulong nelem = 0;
+      for (xgboost::bst_ulong j = 0; j < ncol; ++j) {
+        if (common::CheckNAN(data[ncol*i + j]) && !nan_missing) {
+          badnan[ithread] = 1;
+        } else if (common::CheckNAN(data[ncol * i + j])) {
+        } else if (nan_missing || data[ncol * i + j] != missing) {
+          ++nelem;
+        }
+      }
+      mat.row_ptr_[i+1] = nelem;
+    }
+  }
+  // Inform about any NaNs and resize data matrix
+  for (int i = 0; i < nthread; i++) {
+    CHECK(!badnan[i]) << "There are NAN in the matrix, however, you did not set missing=NAN";
+  }
+
+  // do cumulative sum (to avoid otherwise need to copy)
+  prefixsum_inplace(&mat.row_ptr_[0], mat.row_ptr_.size());
+  mat.row_data_.resize(mat.row_data_.size() + mat.row_ptr_.back());
+
+  // Fill data matrix (now that know size, no need for slow push_back())
+#pragma omp parallel num_threads(nthread)
+  {
+#pragma omp for schedule(static)
+    for (omp_ulong i = 0; i < nrow; ++i) {
+      xgboost::bst_ulong matj = 0;
+      for (xgboost::bst_ulong j = 0; j < ncol; ++j) {
+        if (common::CheckNAN(data[ncol * i + j])) {
+        } else if (nan_missing || data[ncol * i + j] != missing) {
+          mat.row_data_[mat.row_ptr_[i] + matj] =
+              RowBatch::Entry(j, data[ncol * i + j]);
+          ++matj;
+        }
+      }
+    }
+  }
+
  mat.info.num_nonzero = mat.row_data_.size();
  *out  = new std::shared_ptr<DMatrix>(DMatrix::Create(std::move(source)));
  API_END();