Multi-threaded XGDMatrixCreateFromMat for faster DMatrix creation (#2530)

* Multi-threaded XGDMatrixCreateFromMat for faster DMatrix creation from numpy arrays for python interface.
This commit is contained in:
PSEUDOTENSOR / Jonathan McKinney
2017-07-20 19:43:17 -07:00
committed by Rory Mitchell
parent 56550ff3f1
commit 6b375f6ad8
9 changed files with 324 additions and 73 deletions

View File

@@ -0,0 +1,40 @@
// Copyright by Contributors
#include <gtest/gtest.h>
#include <xgboost/c_api.h>
#include <xgboost/data.h>
TEST(c_api, XGDMatrixCreateFromMat_omp) {
std::vector<int> num_rows = {100, 11374, 15000};
for (auto row : num_rows) {
int num_cols = 50;
int num_missing = 5;
DMatrixHandle handle;
std::vector<float> data(num_cols * row, 1.5);
for (int i = 0; i < num_missing; i++) {
data[i] = std::numeric_limits<float>::quiet_NaN();
}
XGDMatrixCreateFromMat_omp(data.data(), row, num_cols,
std::numeric_limits<float>::quiet_NaN(), &handle,
0);
std::shared_ptr<xgboost::DMatrix> dmat =
*static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
xgboost::MetaInfo &info = dmat->info();
ASSERT_EQ(info.num_col, num_cols);
ASSERT_EQ(info.num_row, row);
ASSERT_EQ(info.num_nonzero, num_cols * row - num_missing);
auto iter = dmat->RowIterator();
iter->BeforeFirst();
while (iter->Next()) {
auto batch = iter->Value();
for (int i = 0; i < batch.size; i++) {
auto inst = batch[i];
for (int j = 0; i < inst.length; i++) {
ASSERT_EQ(inst[j].fvalue, 1.5);
}
}
}
}
}

View File

@@ -212,6 +212,23 @@ class TestBasic(unittest.TestCase):
self.assertRaises(xgb.core.XGBoostError, xgb.Booster,
model_file=u'不正なパス')
def test_dmatrix_numpy_init_omp(self):
rows = [1000, 11326, 15000]
cols = 50
for row in rows:
X = np.random.randn(row, cols)
y = np.random.randn(row).astype('f')
dm = xgb.DMatrix(X, y, nthread=0)
np.testing.assert_array_equal(dm.get_label(), y)
assert dm.num_row() == row
assert dm.num_col() == cols
dm = xgb.DMatrix(X, y, nthread=10)
np.testing.assert_array_equal(dm.get_label(), y)
assert dm.num_row() == row
assert dm.num_col() == cols
def test_dmatrix_numpy_init(self):
data = np.random.randn(5, 5)
dm = xgb.DMatrix(data)