Add PushCSC for SparsePage. (#4193)

* Add PushCSC for SparsePage.

* Move Push* definitions into cc file.
* Add std:: prefix to `size_t` to make clang++ happy.
* Address monitor count == 0.
This commit is contained in:
Jiaming Yuan
2019-03-02 01:58:08 +08:00
committed by GitHub
parent 74009afcac
commit 7ea5675679
8 changed files with 199 additions and 50 deletions

View File

@@ -0,0 +1,55 @@
#include <gtest/gtest.h>
#include <vector>
#include "xgboost/data.h"
namespace xgboost {
// Exercises SparsePage::PushCSC by pushing the same source page twice into a
// destination page that starts out with offset {0} and no entries.
TEST(SparsePage, PushCSC) {
  // Destination: a single zero offset and an empty entry list.
  SparsePage page;
  page.offset.HostVector() = std::vector<size_t>{0};
  page.data.HostVector() = std::vector<Entry>{};

  // Source: offsets {0, 1, 4}, i.e. one entry in the first segment and three
  // in the second; entry indices run 0..3 and every value is 0.1f.
  const std::vector<size_t> src_offset{0, 1, 4};
  std::vector<Entry> src_data;
  for (size_t idx = 0; idx < src_offset.back(); ++idx) {
    src_data.push_back(Entry(idx, 0.1f));
  }
  SparsePage other;
  other.offset.HostVector() = src_offset;
  other.data.HostVector() = src_data;

  // First push: the destination is expected to mirror the source.
  page.PushCSC(other);
  {
    auto& pushed_offset = page.offset.HostVector();
    auto& pushed_data = page.data.HostVector();
    ASSERT_EQ(pushed_offset.size(), src_offset.size());
    ASSERT_EQ(pushed_data.size(), src_data.size());
    for (size_t k = 0; k < src_offset.size(); ++k) {
      ASSERT_EQ(pushed_offset[k], src_offset[k]);
    }
    for (size_t k = 0; k < src_data.size(); ++k) {
      ASSERT_EQ(pushed_data[k].index, src_data[k].index);
    }
  }

  // Second push: the offset count stays the same, every offset doubles, and
  // the number of stored entries doubles.
  page.PushCSC(other);
  ASSERT_EQ(page.offset.HostVector().size(), src_offset.size());
  ASSERT_EQ(page.data.Size(), src_data.size() * 2);
  for (size_t k = 0; k < src_offset.size(); ++k) {
    ASSERT_EQ(page.offset.HostVector()[k], src_offset[k] * 2);
  }

  // Segment 0 now holds two entries, both with index 0.
  auto first_segment = page[0];
  ASSERT_EQ(first_segment.size(), 2);
  for (auto entry : first_segment) {
    ASSERT_EQ(entry.index, 0);
  }

  // Segment 1 holds six entries whose indices repeat the pattern 1, 2, 3.
  auto second_segment = page[1];
  ASSERT_EQ(second_segment.size(), 6);
  const std::vector<size_t> expected_cycle{1, 2, 3};
  for (size_t k = 0; k < second_segment.size(); ++k) {
    ASSERT_EQ(second_segment[k].index, expected_cycle[k % 3]);
  }
}
}

View File

@@ -3,6 +3,7 @@
#include <vector>
#include "helpers.h"
#include "xgboost/learner.h"
#include "dmlc/filesystem.h"
namespace xgboost {
@@ -92,4 +93,26 @@ TEST(Learner, CheckGroup) {
delete pp_mat;
}
// Loads a large libsvm file through a "#<cache>" URI, checks that a row-page
// cache file was written and that the matrix is not a single column block,
// then runs one update iteration of a binary:logistic learner on it.
TEST(Learner, SLOW_CheckMultiBatch) {
  using Arg = std::pair<std::string, std::string>;

  // Create sufficiently large data to make two row pages
  dmlc::TemporaryDirectory tempdir;
  const std::string tmp_file = tempdir.path + "/big.libsvm";
  CreateBigTestData(tmp_file, 5000000);

  // The part after '#' names the cache file prefix handed to the loader.
  const std::string uri = tmp_file + "#" + tmp_file + ".cache";
  std::shared_ptr<DMatrix> dmat{xgboost::DMatrix::Load(uri, true, false)};
  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));
  EXPECT_FALSE(dmat->SingleColBlock());

  // Alternate labels 0, 1, 0, 1, ... across all rows.
  const size_t num_row = dmat->Info().num_row_;
  std::vector<bst_float> labels(num_row);
  size_t row = 0;
  for (auto& label : labels) {
    label = row % 2;
    ++row;
  }
  dmat->Info().SetInfo("label", labels.data(), DataType::kFloat32, num_row);

  // Build the learner over this matrix and run a single update iteration.
  std::vector<std::shared_ptr<DMatrix>> cache_mats{dmat};
  std::unique_ptr<Learner> learner{Learner::Create(cache_mats)};
  const std::vector<Arg> config{Arg{"objective", "binary:logistic"}};
  learner->Configure(config);
  learner->InitModel();
  learner->UpdateOneIter(0, dmat.get());
}
} // namespace xgboost