xgboost/tests/cpp/data/test_data.cc
Jiaming Yuan d9a47794a5 Fix CPU hist init for sparse dataset. (#4625)
* Fix CPU hist init for sparse dataset.

* Implement sparse histogram cut.
* Allow empty features.

* Fix windows build, don't use sparse in distributed environment.

* Comments.

* Smaller threshold.

* Fix windows omp.

* Fix msvc lambda capture.

* Fix MSVC macro.

* Fix MSVC initialization list.

* Fix MSVC initialization list x2.

* Preserve categorical feature behavior.

* Rename matrix to sparse cuts.
* Reuse UseGroup.
* Check for categorical data when adding cut.

Co-Authored-By: Philip Hyunsu Cho <chohyu01@cs.washington.edu>

* Sanity check.

* Fix comments.

* Fix comment.
2019-07-04 16:27:03 -07:00

85 lines
2.3 KiB
C++

#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <vector>
#include "xgboost/data.h"
#include "../helpers.h"
namespace xgboost {
// Exercises SparsePage::PushCSC twice with the same source page: the first
// push targets a page that holds no entries, the second appends to the page
// that was populated by the first push.
TEST(SparsePage, PushCSC) {
  // Destination page: a single zero offset and no entries.
  SparsePage page;
  page.offset.HostVector() = std::vector<size_t>{0};
  page.data.HostVector() = std::vector<Entry>{};

  // Source page: offsets {0, 1, 4} put one entry in the first segment and
  // three in the second.  The i-th entry is built from i and a feature value
  // shared by all entries.
  constexpr float kValue = 0.1f;
  const std::vector<size_t> offset{0, 1, 4};
  std::vector<Entry> data;
  for (size_t i = 0; i < offset.back(); ++i) {
    data.emplace_back(Entry(i, kValue));
  }
  SparsePage other;
  other.offset.HostVector() = offset;
  other.data.HostVector() = data;

  // First push: the destination must match the source, values included.
  page.PushCSC(other);
  ASSERT_EQ(page.offset.HostVector().size(), offset.size());
  ASSERT_EQ(page.data.HostVector().size(), data.size());
  for (size_t i = 0; i < offset.size(); ++i) {
    ASSERT_EQ(page.offset.HostVector()[i], offset[i]);
  }
  for (size_t i = 0; i < data.size(); ++i) {
    ASSERT_EQ(page.data.HostVector()[i].index, data[i].index);
    ASSERT_EQ(page.data.HostVector()[i].fvalue, data[i].fvalue);
  }

  // Second push: the number of offsets is unchanged while the entry count and
  // every offset double.
  page.PushCSC(other);
  ASSERT_EQ(page.offset.HostVector().size(), offset.size());
  ASSERT_EQ(page.data.Size(), data.size() * 2);
  for (size_t i = 0; i < offset.size(); ++i) {
    ASSERT_EQ(page.offset.HostVector()[i], offset[i] * 2);
  }

  // First segment: two entries, both with index 0.  The literal is unsigned so
  // that the comparison does not mix signedness with the entry index.
  auto inst = page[0];
  ASSERT_EQ(inst.size(), 2);
  for (const auto& entry : inst) {
    ASSERT_EQ(entry.index, 0u);
    ASSERT_EQ(entry.fvalue, kValue);
  }

  // Second segment: six entries whose indices cycle through 1, 2, 3.
  inst = page[1];
  ASSERT_EQ(inst.size(), 6);
  const std::vector<size_t> indices_sol{1, 2, 3};
  size_t pos = 0;
  for (const auto& entry : inst) {
    ASSERT_EQ(entry.index, indices_sol[pos % indices_sol.size()]);
    ASSERT_EQ(entry.fvalue, kValue);
    ++pos;
  }
}
// Transposes every row batch of a DMatrix built by CreateSparsePageDMatrix,
// merges the transposed batches into one page with PushCSC, and then checks
// the total entry count and the per-instance feature values.
TEST(SparsePage, PushCSCAfterTranspose) {
  dmlc::TemporaryDirectory tmpdir;
  const std::string filename = tmpdir.path + "/big.libsvm";
  // Unsigned so that it compares against page.data.Size() without mixing
  // signed and unsigned operands inside ASSERT_EQ.
  const size_t n_entries = 9;
  std::unique_ptr<DMatrix> dmat =
      CreateSparsePageDMatrix(n_entries, 64UL, filename);
  // Narrowing to int is intentional; the value is handed to GetTranspose below.
  const int ncols = static_cast<int>(dmat->Info().num_col_);

  SparsePage page;  // Consolidated sparse page
  for (const auto& batch : dmat->GetRowBatches()) {
    // Transpose each batch and push
    SparsePage tmp = batch.GetTranspose(ncols);
    page.PushCSC(tmp);
  }

  // Make sure that the final sparse page has the right number of entries
  ASSERT_EQ(n_entries, page.data.Size());

  // The feature value for a feature in each row should be identical, as that is
  // how the dmatrix has been created
  for (size_t i = 0; i < page.Size(); ++i) {
    auto inst = page[i];
    // Use the span's own size type for the counter so the loop condition never
    // compares integers of different signedness or width.
    using IndexT = decltype(inst.size());
    for (IndexT j = 1; j < inst.size(); ++j) {
      ASSERT_EQ(inst[0].fvalue, inst[j].fvalue);
    }
  }
}
} // namespace xgboost