Use weakref instead of id for DataIter cache. (#9445)
- Fix case where Python reuses id from freed objects.
- Small optimization to column matrix with QDM by using `realloc` instead of copying data.
This commit is contained in:
@@ -2,15 +2,26 @@
|
||||
* Copyright 2018-2023 by XGBoost Contributors
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
#include <xgboost/base.h> // for bst_bin_t
|
||||
#include <xgboost/context.h> // for Context
|
||||
#include <xgboost/data.h> // for BatchIterator, BatchSet, DMatrix, Met...
|
||||
|
||||
#include "../../../src/common/column_matrix.h"
|
||||
#include "../helpers.h"
|
||||
#include <cstddef> // for size_t
|
||||
#include <cstdint> // for int32_t, uint16_t, uint8_t
|
||||
#include <limits> // for numeric_limits
|
||||
#include <memory> // for shared_ptr, __shared_ptr_access, allo...
|
||||
#include <type_traits> // for remove_reference_t
|
||||
|
||||
#include "../../../src/common/column_matrix.h" // for ColumnMatrix, Column, DenseColumnIter
|
||||
#include "../../../src/common/hist_util.h" // for DispatchBinType, BinTypeSize, Index
|
||||
#include "../../../src/common/ref_resource_view.h" // for RefResourceView
|
||||
#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix
|
||||
#include "../../../src/data/iterative_dmatrix.h" // for IterativeDMatrix
|
||||
#include "../../../src/tree/param.h" // for TrainParam
|
||||
#include "../helpers.h" // for RandomDataGenerator, NumpyArrayIterFo...
|
||||
|
||||
namespace xgboost {
|
||||
namespace common {
|
||||
|
||||
TEST(DenseColumn, Test) {
|
||||
namespace xgboost::common {
|
||||
TEST(ColumnMatrix, Basic) {
|
||||
int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
|
||||
@@ -22,7 +33,7 @@ TEST(DenseColumn, Test) {
|
||||
GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, sparse_thresh, false};
|
||||
ColumnMatrix column_matrix;
|
||||
for (auto const& page : dmat->GetBatches<SparsePage>()) {
|
||||
column_matrix.InitFromSparse(page, gmat, sparse_thresh, AllThreadsForTest());
|
||||
column_matrix.InitFromSparse(page, gmat, sparse_thresh, ctx.Threads());
|
||||
}
|
||||
ASSERT_GE(column_matrix.GetTypeSize(), last);
|
||||
ASSERT_LE(column_matrix.GetTypeSize(), kUint32BinsTypeSize);
|
||||
@@ -59,7 +70,7 @@ void CheckSparseColumn(SparseColumnIter<BinIdxType>* p_col, const GHistIndexMatr
|
||||
}
|
||||
}
|
||||
|
||||
TEST(SparseColumn, Test) {
|
||||
TEST(ColumnMatrix, SparseColumn) {
|
||||
int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
|
||||
@@ -69,7 +80,7 @@ TEST(SparseColumn, Test) {
|
||||
GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, 0.5f, false};
|
||||
ColumnMatrix column_matrix;
|
||||
for (auto const& page : dmat->GetBatches<SparsePage>()) {
|
||||
column_matrix.InitFromSparse(page, gmat, 1.0, AllThreadsForTest());
|
||||
column_matrix.InitFromSparse(page, gmat, 1.0, ctx.Threads());
|
||||
}
|
||||
common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
|
||||
using T = decltype(dtype);
|
||||
@@ -90,7 +101,7 @@ void CheckColumWithMissingValue(const DenseColumnIter<BinIdxType, true>& col,
|
||||
}
|
||||
}
|
||||
|
||||
TEST(DenseColumnWithMissing, Test) {
|
||||
TEST(ColumnMatrix, DenseColumnWithMissing) {
|
||||
int32_t max_num_bins[] = {static_cast<int32_t>(std::numeric_limits<uint8_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 1,
|
||||
static_cast<int32_t>(std::numeric_limits<uint16_t>::max()) + 2};
|
||||
@@ -100,7 +111,7 @@ TEST(DenseColumnWithMissing, Test) {
|
||||
GHistIndexMatrix gmat(&ctx, dmat.get(), max_num_bin, 0.2, false);
|
||||
ColumnMatrix column_matrix;
|
||||
for (auto const& page : dmat->GetBatches<SparsePage>()) {
|
||||
column_matrix.InitFromSparse(page, gmat, 0.2, AllThreadsForTest());
|
||||
column_matrix.InitFromSparse(page, gmat, 0.2, ctx.Threads());
|
||||
}
|
||||
ASSERT_TRUE(column_matrix.AnyMissing());
|
||||
DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
|
||||
@@ -110,5 +121,29 @@ TEST(DenseColumnWithMissing, Test) {
|
||||
});
|
||||
}
|
||||
}
|
||||
} // namespace common
|
||||
} // namespace xgboost
|
||||
|
||||
// Builds an IterativeDMatrix from a NumpyArrayIterForTest created with sparsity 0.5,
// then, for every GHistIndexMatrix page, inspects the transposed column matrix:
//   1. the storage size of its missing-value bit field must equal the size computed
//      for Rows() * Cols() entries, and
//   2. dense column 0 (accessed with the missing-value flag set to true) must pass
//      CheckColumWithMissingValue against the page.
TEST(ColumnMatrix, GrowMissing) {
|
||||
float sparsity = 0.5;
|
||||
NumpyArrayIterForTest iter(sparsity);
|
||||
auto n_threads = 0;
|
||||
bst_bin_t n_bins = 16;
|
||||
BatchParam batch{n_bins, tree::TrainParam::DftSparseThreshold()};
|
||||
Context ctx;
|
||||
// NaN is passed as the missing-value marker for the iterative DMatrix.
auto m = std::make_shared<data::IterativeDMatrix>(&iter, iter.Proxy(), nullptr, Reset, Next,
|
||||
std::numeric_limits<float>::quiet_NaN(),
|
||||
n_threads, n_bins);
|
||||
for (auto const& page : m->GetBatches<GHistIndexMatrix>(&ctx, batch)) {
|
||||
auto const& column_matrix = page.Transpose();
|
||||
auto const& missing = column_matrix.Missing();
|
||||
// Total number of entries produced by the test iterator's Rows() and Cols().
// NOTE(review): presumably this is the entry count summed over all batches; confirm against NumpyArrayIterForTest.
auto n = NumpyArrayIterForTest::Rows() * NumpyArrayIterForTest::Cols();
|
||||
// Expected bit field storage size for n entries, compared with the actual storage size.
auto expected = std::remove_reference_t<decltype(missing)>::BitFieldT::ComputeStorageSize(n);
|
||||
auto got = missing.storage.size();
|
||||
ASSERT_EQ(expected, got);
|
||||
// Dispatch on the column matrix's bin index type size so the column is read with the matching type T.
DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
|
||||
using T = decltype(dtype);
|
||||
auto col = column_matrix.DenseColumn<T, true>(0);
|
||||
CheckColumWithMissingValue(col, page);
|
||||
});
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
|
||||
@@ -119,6 +119,20 @@ TEST(IO, Resource) {
|
||||
for (std::size_t i = n; i < 2 * n; ++i) {
|
||||
ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 0);
|
||||
}
|
||||
|
||||
ptr = malloc_resource->DataAs<std::uint8_t>();
|
||||
std::fill_n(ptr, malloc_resource->Size(), 7);
|
||||
if (force_malloc) {
|
||||
malloc_resource->Resize<true>(n * 3, std::byte{3});
|
||||
} else {
|
||||
malloc_resource->Resize<false>(n * 3, std::byte{3});
|
||||
}
|
||||
for (std::size_t i = 0; i < n * 2; ++i) {
|
||||
ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 7);
|
||||
}
|
||||
for (std::size_t i = n * 2; i < n * 3; ++i) {
|
||||
ASSERT_EQ(malloc_resource->DataAs<std::uint8_t>()[i], 3);
|
||||
}
|
||||
};
|
||||
test_malloc_resize(true);
|
||||
test_malloc_resize(false);
|
||||
|
||||
@@ -12,8 +12,7 @@
|
||||
#include "../helpers.h"
|
||||
#include "xgboost/data.h" // DMatrix
|
||||
|
||||
namespace xgboost {
|
||||
namespace data {
|
||||
namespace xgboost::data {
|
||||
TEST(IterativeDMatrix, Ref) {
|
||||
Context ctx;
|
||||
TestRefDMatrix<GHistIndexMatrix, NumpyArrayIterForTest>(
|
||||
@@ -21,7 +20,7 @@ TEST(IterativeDMatrix, Ref) {
|
||||
}
|
||||
|
||||
TEST(IterativeDMatrix, IsDense) {
|
||||
int n_bins = 16;
|
||||
bst_bin_t n_bins = 16;
|
||||
auto test = [n_bins](float sparsity) {
|
||||
NumpyArrayIterForTest iter(sparsity);
|
||||
auto n_threads = 0;
|
||||
@@ -38,5 +37,4 @@ TEST(IterativeDMatrix, IsDense) {
|
||||
test(0.1);
|
||||
test(1.0);
|
||||
}
|
||||
} // namespace data
|
||||
} // namespace xgboost
|
||||
} // namespace xgboost::data
|
||||
|
||||
Reference in New Issue
Block a user