Reducing memory consumption for 'hist' method on CPU (#5334)
This commit is contained in:
@@ -9,28 +9,46 @@ namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
// Verifies that every column of a ColumnMatrix reports the same global bin
// index as the GHistIndexMatrix it was initialised from.  The check is
// repeated for max-bin settings of 256, 65536 and 65537.
TEST(DenseColumn, Test) {
  uint64_t max_num_bins[] = {static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
                             static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
                             static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2};
  for (size_t max_num_bin : max_num_bins) {
    // 100 x 10 matrix; the third generator argument is 0.0
    // (presumably the sparsity -- TODO confirm against RandomDataGenerator).
    auto dmat = RandomDataGenerator(100, 10, 0.0).GenerateDMatix();
    GHistIndexMatrix gmat;
    gmat.Init(dmat.get(), max_num_bin);
    ColumnMatrix column_matrix;
    column_matrix.Init(gmat, 0.2);

    for (auto i = 0ull; i < dmat->Info().num_row_; i++) {
      for (auto j = 0ull; j < dmat->Info().num_col_; j++) {
        // GetColumn is templated on the bin index type, so pick the
        // instantiation that matches the width reported by the matrix.
        switch (column_matrix.GetTypeSize()) {
          case UINT8_BINS_TYPE_SIZE: {
            auto col = column_matrix.GetColumn<uint8_t>(j);
            // gmat.index is addressed row-major: row i, feature j.
            ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j],
                      (*col.get()).GetGlobalBinIdx(i));
          }
          break;
          case UINT16_BINS_TYPE_SIZE: {
            auto col = column_matrix.GetColumn<uint16_t>(j);
            ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j],
                      (*col.get()).GetGlobalBinIdx(i));
          }
          break;
          case UINT32_BINS_TYPE_SIZE: {
            auto col = column_matrix.GetColumn<uint32_t>(j);
            ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j],
                      (*col.get()).GetGlobalBinIdx(i));
          }
          break;
        }
      }
    }
  }
}
|
||||
|
||||
TEST(SparseColumn, Test) {
|
||||
auto dmat = RandomDataGenerator(100, 1, 0.85).GenerateDMatix();
|
||||
GHistIndexMatrix gmat;
|
||||
gmat.Init(dmat.get(), 256);
|
||||
ColumnMatrix column_matrix;
|
||||
column_matrix.Init(gmat, 0.5);
|
||||
auto col = column_matrix.GetColumn(0);
|
||||
template<typename BinIdxType>
|
||||
inline void CheckSparseColumn(const Column<BinIdxType>& col_input, const GHistIndexMatrix& gmat) {
|
||||
const SparseColumn<BinIdxType>& col = static_cast<const SparseColumn<BinIdxType>& >(col_input);
|
||||
ASSERT_EQ(col.Size(), gmat.index.size());
|
||||
for (auto i = 0ull; i < col.Size(); i++) {
|
||||
ASSERT_EQ(gmat.index[gmat.row_ptr[col.GetRowIdx(i)]],
|
||||
@@ -38,20 +56,77 @@ TEST(SparseColumn, Test) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(DenseColumnWithMissing, Test) {
|
||||
auto dmat = RandomDataGenerator(100, 1, 0.5).GenerateDMatix();
|
||||
GHistIndexMatrix gmat;
|
||||
gmat.Init(dmat.get(), 256);
|
||||
ColumnMatrix column_matrix;
|
||||
column_matrix.Init(gmat, 0.2);
|
||||
auto col = column_matrix.GetColumn(0);
|
||||
// For each max-bin setting (256, 65536, 65537) builds a 100 x 1 matrix and
// hands column 0 to CheckSparseColumn, using the GetColumn instantiation that
// matches the bin index width reported by the ColumnMatrix.
TEST(SparseColumn, Test) {
  constexpr uint64_t kUint8Span =
      static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1;
  constexpr uint64_t kUint16Span =
      static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1;
  const uint64_t bin_limits[] = {kUint8Span, kUint16Span, kUint16Span + 1};

  for (size_t bin_limit : bin_limits) {
    auto dmat = RandomDataGenerator(100, 1, 0.85).GenerateDMatix();

    GHistIndexMatrix gmat;
    gmat.Init(dmat.get(), bin_limit);

    ColumnMatrix column_matrix;
    column_matrix.Init(gmat, 0.5);

    switch (column_matrix.GetTypeSize()) {
      case UINT8_BINS_TYPE_SIZE: {
        const auto column = column_matrix.GetColumn<uint8_t>(0);
        CheckSparseColumn(*column.get(), gmat);
        break;
      }
      case UINT16_BINS_TYPE_SIZE: {
        const auto column = column_matrix.GetColumn<uint16_t>(0);
        CheckSparseColumn(*column.get(), gmat);
        break;
      }
      case UINT32_BINS_TYPE_SIZE: {
        const auto column = column_matrix.GetColumn<uint32_t>(0);
        CheckSparseColumn(*column.get(), gmat);
        break;
      }
    }
  }
}
|
||||
|
||||
template<typename BinIdxType>
|
||||
inline void CheckColumWithMissingValue(const Column<BinIdxType>& col_input,
|
||||
const GHistIndexMatrix& gmat) {
|
||||
const DenseColumn<BinIdxType>& col = static_cast<const DenseColumn<BinIdxType>& >(col_input);
|
||||
for (auto i = 0ull; i < col.Size(); i++) {
|
||||
if (col.IsMissing(i)) continue;
|
||||
EXPECT_EQ(gmat.index[gmat.row_ptr[col.GetRowIdx(i)]],
|
||||
EXPECT_EQ(gmat.index[gmat.row_ptr[i]],
|
||||
col.GetGlobalBinIdx(i));
|
||||
}
|
||||
}
|
||||
|
||||
// For each max-bin setting (256, 65536, 65537) builds a 100 x 1 matrix and
// hands column 0 to CheckColumWithMissingValue, using the GetColumn
// instantiation that matches the bin index width reported by the ColumnMatrix.
TEST(DenseColumnWithMissing, Test) {
  constexpr uint64_t kUint8Span =
      static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1;
  constexpr uint64_t kUint16Span =
      static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1;
  const uint64_t bin_limits[] = {kUint8Span, kUint16Span, kUint16Span + 1};

  for (size_t bin_limit : bin_limits) {
    auto dmat = RandomDataGenerator(100, 1, 0.5).GenerateDMatix();

    GHistIndexMatrix gmat;
    gmat.Init(dmat.get(), bin_limit);

    ColumnMatrix column_matrix;
    column_matrix.Init(gmat, 0.2);

    switch (column_matrix.GetTypeSize()) {
      case UINT8_BINS_TYPE_SIZE: {
        const auto column = column_matrix.GetColumn<uint8_t>(0);
        CheckColumWithMissingValue(*column.get(), gmat);
        break;
      }
      case UINT16_BINS_TYPE_SIZE: {
        const auto column = column_matrix.GetColumn<uint16_t>(0);
        CheckColumWithMissingValue(*column.get(), gmat);
        break;
      }
      case UINT32_BINS_TYPE_SIZE: {
        const auto column = column_matrix.GetColumn<uint32_t>(0);
        CheckColumWithMissingValue(*column.get(), gmat);
        break;
      }
    }
  }
}
|
||||
|
||||
void TestGHistIndexMatrixCreation(size_t nthreads) {
|
||||
dmlc::TemporaryDirectory tmpdir;
|
||||
std::string filename = tmpdir.path + "/big.libsvm";
|
||||
|
||||
@@ -347,5 +347,106 @@ TEST(hist_util, SparseCutsExternalMemory) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks which bin index width GHistIndexMatrix selects for max-bin settings
// of 256, 65536 and 65537: the expected results are the 8-, 16- and 32-bit
// type sizes respectively.  Also checks that the index holds one entry per
// matrix cell.
TEST(hist_util, IndexBinBound) {
  size_t constexpr kRows = 100;
  size_t constexpr kCols = 10;

  // Each scenario pairs a max-bin setting with the type size expected for it.
  struct Scenario {
    uint64_t max_bin;
    BinTypeSize expected_type_size;
  };
  const Scenario scenarios[] = {
      {static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1, UINT8_BINS_TYPE_SIZE},
      {static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1, UINT16_BINS_TYPE_SIZE},
      {static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2, UINT32_BINS_TYPE_SIZE}};

  for (const Scenario& scenario : scenarios) {
    auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatix();

    common::GHistIndexMatrix hmat;
    hmat.Init(p_fmat.get(), scenario.max_bin);

    EXPECT_EQ(hmat.index.size(), kRows*kCols);
    EXPECT_EQ(scenario.expected_type_size, hmat.index.getBinTypeSize());
  }
}
|
||||
|
||||
// Same max-bin settings as the dense bound test (256, 65536, 65537), but the
// generator's third argument is 0.2 instead of 0.  In every case the index is
// expected to report the 32-bit bin type size.
TEST(hist_util, SparseIndexBinBound) {
  size_t constexpr kRows = 100;
  size_t constexpr kCols = 10;

  // Each scenario pairs a max-bin setting with the type size expected for it.
  struct Scenario {
    uint64_t max_bin;
    BinTypeSize expected_type_size;
  };
  const Scenario scenarios[] = {
      {static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1, UINT32_BINS_TYPE_SIZE},
      {static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1, UINT32_BINS_TYPE_SIZE},
      {static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2, UINT32_BINS_TYPE_SIZE}};

  for (const Scenario& scenario : scenarios) {
    auto p_fmat = RandomDataGenerator(kRows, kCols, 0.2).GenerateDMatix();

    common::GHistIndexMatrix hmat;
    hmat.Init(p_fmat.get(), scenario.max_bin);

    EXPECT_EQ(scenario.expected_type_size, hmat.index.getBinTypeSize());
  }
}
|
||||
|
||||
template <typename T>
|
||||
void CheckIndexData(T* data_ptr, uint32_t* offsets,
|
||||
const common::GHistIndexMatrix& hmat, size_t n_cols) {
|
||||
for (size_t i = 0; i < hmat.index.size(); ++i) {
|
||||
EXPECT_EQ(data_ptr[i] + offsets[i % n_cols], hmat.index[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// For max-bin settings of 256, 65536 and 65537, checks that the raw index
// storage (read as uint8_t, uint16_t and uint32_t respectively) combined with
// the offsets reproduces the values returned by hmat.index[].
TEST(hist_util, IndexBinData) {
  uint64_t constexpr kBinSizes[] = { static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
                                     static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
                                     static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
  size_t constexpr kRows = 100;
  size_t constexpr kCols = 10;

  for (auto max_bin : kBinSizes) {
    auto p_fmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatix();
    common::GHistIndexMatrix hmat;
    hmat.Init(p_fmat.get(), max_bin);
    uint32_t* offsets = hmat.index.offset();
    EXPECT_EQ(hmat.index.size(), kRows*kCols);
    // The element type used to read the buffer is chosen from the requested
    // max_bin; kBinSizes is constexpr so its elements can be case labels.
    switch (max_bin) {
      case kBinSizes[0]:
        CheckIndexData(hmat.index.data<uint8_t>(),
                       offsets, hmat, kCols);
        break;
      case kBinSizes[1]:
        CheckIndexData(hmat.index.data<uint16_t>(),
                       offsets, hmat, kCols);
        break;
      case kBinSizes[2]:
        CheckIndexData(hmat.index.data<uint32_t>(),
                       offsets, hmat, kCols);
        break;
    }
  }
}
|
||||
|
||||
// For max-bin settings of 256, 65536 and 65537 on data generated with a third
// generator argument of 0.2, checks that the index has no offset array and
// that the raw storage, read as uint32_t, matches hmat.index[] element by
// element.
TEST(hist_util, SparseIndexBinData) {
  uint64_t bin_sizes[] = { static_cast<uint64_t>(std::numeric_limits<uint8_t>::max()) + 1,
                           static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 1,
                           static_cast<uint64_t>(std::numeric_limits<uint16_t>::max()) + 2 };
  size_t constexpr kRows = 100;
  size_t constexpr kCols = 10;

  for (auto max_bin : bin_sizes) {
    auto p_fmat = RandomDataGenerator(kRows, kCols, 0.2).GenerateDMatix();
    common::GHistIndexMatrix hmat;
    hmat.Init(p_fmat.get(), max_bin);
    // No per-column offsets are expected here, so values are compared directly.
    EXPECT_EQ(hmat.index.offset(), nullptr);

    uint32_t* data_ptr = hmat.index.data<uint32_t>();
    for (size_t i = 0; i < hmat.index.size(); ++i) {
      EXPECT_EQ(data_ptr[i], hmat.index[i]);
    }
  }
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
Reference in New Issue
Block a user