#include #include #include #include #include #include #include "helpers.h" #include "../../src/common/io.h" #include "../../src/common/random.h" namespace xgboost { void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr p_dmat) { for (auto& batch : p_dmat->GetBatches()) { batch.data.HostVector(); batch.offset.HostVector(); } int32_t constexpr kIters = 2; dmlc::TemporaryDirectory tempdir; std::string const fname = tempdir.path + "/model"; std::vector dumped_0; std::string model_at_kiter; { std::unique_ptr fo(dmlc::Stream::Create(fname.c_str(), "w")); std::unique_ptr learner {Learner::Create({p_dmat})}; learner->SetParams(args); for (int32_t iter = 0; iter < kIters; ++iter) { learner->UpdateOneIter(iter, p_dmat.get()); } dumped_0 = learner->DumpModel(fmap, true, "json"); learner->Save(fo.get()); common::MemoryBufferStream mem_out(&model_at_kiter); learner->Save(&mem_out); } std::vector dumped_1; { std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r")); std::unique_ptr learner {Learner::Create({p_dmat})}; learner->Load(fi.get()); learner->Configure(); dumped_1 = learner->DumpModel(fmap, true, "json"); } ASSERT_EQ(dumped_0, dumped_1); std::string model_at_2kiter; // Test training continuation with data from host { std::string continued_model; { // Continue the previous training with another kIters std::unique_ptr fi( dmlc::Stream::Create(fname.c_str(), "r")); std::unique_ptr learner{Learner::Create({p_dmat})}; learner->Load(fi.get()); learner->Configure(); // verify the loaded model doesn't change. std::string serialised_model_tmp; common::MemoryBufferStream mem_out(&serialised_model_tmp); learner->Save(&mem_out); ASSERT_EQ(model_at_kiter, serialised_model_tmp); for (auto &batch : p_dmat->GetBatches()) { batch.data.HostVector(); batch.offset.HostVector(); } for (int32_t iter = kIters; iter < 2 * kIters; ++iter) { learner->UpdateOneIter(iter, p_dmat.get()); } common::MemoryBufferStream fo(&continued_model); learner->Save(&fo); } { // Train 2 * kIters in one go std::unique_ptr learner{Learner::Create({p_dmat})}; learner->SetParams(args); for (int32_t iter = 0; iter < 2 * kIters; ++iter) { learner->UpdateOneIter(iter, p_dmat.get()); // Verify model is same at the same iteration during two training // sessions. if (iter == kIters - 1) { std::string reproduced_model; common::MemoryBufferStream fo(&reproduced_model); learner->Save(&fo); ASSERT_EQ(model_at_kiter, reproduced_model); } } common::MemoryBufferStream fo(&model_at_2kiter); learner->Save(&fo); } Json m_0 = Json::Load(StringView{continued_model.c_str(), continued_model.size()}); Json m_1 = Json::Load(StringView{model_at_2kiter.c_str(), model_at_2kiter.size()}); ASSERT_EQ(m_0, m_1); } // Test training continuation with data from device. { // Continue the previous training but on data from device. std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r")); std::unique_ptr learner{Learner::Create({p_dmat})}; learner->Load(fi.get()); learner->Configure(); // verify the loaded model doesn't change. std::string serialised_model_tmp; common::MemoryBufferStream mem_out(&serialised_model_tmp); learner->Save(&mem_out); ASSERT_EQ(model_at_kiter, serialised_model_tmp); learner->SetParam("gpu_id", "0"); // Pull data to device for (auto &batch : p_dmat->GetBatches()) { batch.data.SetDevice(0); batch.data.DeviceSpan(); batch.offset.SetDevice(0); batch.offset.DeviceSpan(); } for (int32_t iter = kIters; iter < 2 * kIters; ++iter) { learner->UpdateOneIter(iter, p_dmat.get()); } serialised_model_tmp = std::string{}; common::MemoryBufferStream fo(&serialised_model_tmp); learner->Save(&fo); Json m_0 = Json::Load(StringView{model_at_2kiter.c_str(), model_at_2kiter.size()}); Json m_1 = Json::Load(StringView{serialised_model_tmp.c_str(), serialised_model_tmp.size()}); // GPU ID is changed as data is coming from device. ASSERT_EQ(get(m_0["Config"]["learner"]["generic_param"]).erase("gpu_id"), get(m_1["Config"]["learner"]["generic_param"]).erase("gpu_id")); } } // Binary is not tested, as it is NOT reproducible. class SerializationTest : public ::testing::Test { protected: size_t constexpr static kRows = 10; size_t constexpr static kCols = 10; std::shared_ptr* pp_dmat_; FeatureMap fmap_; protected: ~SerializationTest() override { delete pp_dmat_; } void SetUp() override { pp_dmat_ = CreateDMatrix(kRows, kCols, .5f); std::shared_ptr p_dmat{*pp_dmat_}; p_dmat->Info().labels_.Resize(kRows); auto &h_labels = p_dmat->Info().labels_.HostVector(); xgboost::SimpleLCG gen(0); SimpleRealUniformDistribution dis(0.0f, 1.0f); for (auto& v : h_labels) { v = dis(&gen); } for (size_t i = 0; i < kCols; ++i) { std::string name = "feat_" + std::to_string(i); fmap_.PushBack(i, name.c_str(), "q"); } } }; TEST_F(SerializationTest, Exact) { TestLearnerSerialization({{"booster", "gbtree"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "gbtree"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"num_parallel_tree", "4"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, *pp_dmat_); } TEST_F(SerializationTest, Approx) { TestLearnerSerialization({{"booster", "gbtree"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "gbtree"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"num_parallel_tree", "4"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, *pp_dmat_); } TEST_F(SerializationTest, Hist) { TestLearnerSerialization({{"booster", "gbtree"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "gbtree"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"num_parallel_tree", "4"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, *pp_dmat_); } TEST_F(SerializationTest, CPU_CoordDescent) { TestLearnerSerialization({{"booster", "gblinear"}, {"seed", "0"}, {"nthread", "1"}, {"enable_experimental_json_serialization", "1"}, {"updater", "coord_descent"}}, fmap_, *pp_dmat_); } #if defined(XGBOOST_USE_CUDA) TEST_F(SerializationTest, GPU_Hist) { TestLearnerSerialization({{"booster", "gbtree"}, {"seed", "0"}, {"enable_experimental_json_serialization", "1"}, {"nthread", "1"}, {"max_depth", "2"}, {"tree_method", "gpu_hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "gbtree"}, {"seed", "0"}, {"enable_experimental_json_serialization", "1"}, {"nthread", "1"}, {"max_depth", "2"}, {"num_parallel_tree", "4"}, {"tree_method", "gpu_hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"seed", "0"}, {"enable_experimental_json_serialization", "1"}, {"nthread", "1"}, {"max_depth", "2"}, {"tree_method", "gpu_hist"}}, fmap_, *pp_dmat_); } TEST_F(SerializationTest, ConfigurationCount) { auto& p_dmat = *pp_dmat_; std::vector> mat = {p_dmat}; xgboost::ConsoleLogger::Configure({{"verbosity", "3"}}); testing::internal::CaptureStderr(); std::string model_str; { auto learner = std::unique_ptr(Learner::Create(mat)); learner->SetParam("tree_method", "gpu_hist"); learner->SetParam("enable_experimental_json_serialization", "1"); for (size_t i = 0; i < 10; ++i) { learner->UpdateOneIter(i, p_dmat.get()); } common::MemoryBufferStream fo(&model_str); learner->Save(&fo); } { common::MemoryBufferStream fi(&model_str); auto learner = std::unique_ptr(Learner::Create(mat)); learner->Load(&fi); for (size_t i = 0; i < 10; ++i) { learner->UpdateOneIter(i, p_dmat.get()); } } std::string output = testing::internal::GetCapturedStderr(); std::string target = "[GPU Hist]: Configure"; ASSERT_NE(output.find(target), std::string::npos); size_t occureences = 0; size_t pos = 0; // Should run configuration exactly 2 times, one for each learner. while ((pos = output.find("[GPU Hist]: Configure", pos)) != std::string::npos) { occureences ++; pos += target.size(); } ASSERT_EQ(occureences, 2); xgboost::ConsoleLogger::Configure({{"verbosity", "1"}}); } TEST_F(SerializationTest, GPU_CoordDescent) { TestLearnerSerialization({{"booster", "gblinear"}, {"seed", "0"}, {"nthread", "1"}, {"enable_experimental_json_serialization", "1"}, {"updater", "gpu_coord_descent"}}, fmap_, *pp_dmat_); } #endif // defined(XGBOOST_USE_CUDA) class LogitSerializationTest : public SerializationTest { protected: void SetUp() override { pp_dmat_ = CreateDMatrix(kRows, kCols, .5f); std::shared_ptr p_dmat{*pp_dmat_}; p_dmat->Info().labels_.Resize(kRows); auto &h_labels = p_dmat->Info().labels_.HostVector(); std::bernoulli_distribution flip(0.5); auto& rnd = common::GlobalRandom(); rnd.seed(0); for (auto& v : h_labels) { v = flip(rnd); } for (size_t i = 0; i < kCols; ++i) { std::string name = "feat_" + std::to_string(i); fmap_.PushBack(i, name.c_str(), "q"); } } }; TEST_F(LogitSerializationTest, Exact) { TestLearnerSerialization({{"booster", "gbtree"}, {"objective", "binary:logistic"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"objective", "binary:logistic"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, *pp_dmat_); } TEST_F(LogitSerializationTest, Approx) { TestLearnerSerialization({{"booster", "gbtree"}, {"objective", "binary:logistic"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"objective", "binary:logistic"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, *pp_dmat_); } TEST_F(LogitSerializationTest, Hist) { TestLearnerSerialization({{"booster", "gbtree"}, {"objective", "binary:logistic"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"objective", "binary:logistic"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, *pp_dmat_); } TEST_F(LogitSerializationTest, CPU_CoordDescent) { TestLearnerSerialization({{"booster", "gblinear"}, {"seed", "0"}, {"nthread", "1"}, {"enable_experimental_json_serialization", "1"}, {"updater", "coord_descent"}}, fmap_, *pp_dmat_); } #if defined(XGBOOST_USE_CUDA) TEST_F(LogitSerializationTest, GPU_Hist) { TestLearnerSerialization({{"booster", "gbtree"}, {"objective", "binary:logistic"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "gpu_hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "gbtree"}, {"objective", "binary:logistic"}, {"seed", "0"}, {"enable_experimental_json_serialization", "1"}, {"nthread", "1"}, {"max_depth", "2"}, {"num_parallel_tree", "4"}, {"tree_method", "gpu_hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"objective", "binary:logistic"}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "gpu_hist"}}, fmap_, *pp_dmat_); } TEST_F(LogitSerializationTest, GPU_CoordDescent) { TestLearnerSerialization({{"booster", "gblinear"}, {"objective", "binary:logistic"}, {"seed", "0"}, {"nthread", "1"}, {"enable_experimental_json_serialization", "1"}, {"updater", "gpu_coord_descent"}}, fmap_, *pp_dmat_); } #endif // defined(XGBOOST_USE_CUDA) class MultiClassesSerializationTest : public SerializationTest { protected: size_t constexpr static kClasses = 4; void SetUp() override { pp_dmat_ = CreateDMatrix(kRows, kCols, .5f); std::shared_ptr p_dmat{*pp_dmat_}; p_dmat->Info().labels_.Resize(kRows); auto &h_labels = p_dmat->Info().labels_.HostVector(); std::uniform_int_distribution categorical(0, kClasses - 1); auto& rnd = common::GlobalRandom(); rnd.seed(0); for (auto& v : h_labels) { v = categorical(rnd); } for (size_t i = 0; i < kCols; ++i) { std::string name = "feat_" + std::to_string(i); fmap_.PushBack(i, name.c_str(), "q"); } } }; TEST_F(MultiClassesSerializationTest, Exact) { TestLearnerSerialization({{"booster", "gbtree"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "gbtree"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"num_parallel_tree", "4"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, *pp_dmat_); } TEST_F(MultiClassesSerializationTest, Approx) { TestLearnerSerialization({{"booster", "gbtree"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, *pp_dmat_); } TEST_F(MultiClassesSerializationTest, Hist) { TestLearnerSerialization({{"booster", "gbtree"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "gbtree"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"enable_experimental_json_serialization", "1"}, {"num_parallel_tree", "4"}, {"tree_method", "hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, *pp_dmat_); } TEST_F(MultiClassesSerializationTest, CPU_CoordDescent) { TestLearnerSerialization({{"booster", "gblinear"}, {"seed", "0"}, {"nthread", "1"}, {"enable_experimental_json_serialization", "1"}, {"updater", "coord_descent"}}, fmap_, *pp_dmat_); } #if defined(XGBOOST_USE_CUDA) TEST_F(MultiClassesSerializationTest, GPU_Hist) { TestLearnerSerialization({{"booster", "gbtree"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "gpu_hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "gbtree"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, // GPU_Hist has higher floating point error. 1e-6 doesn't work // after num_parallel_tree goes to 4 {"num_parallel_tree", "3"}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "gpu_hist"}}, fmap_, *pp_dmat_); TestLearnerSerialization({{"booster", "dart"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"enable_experimental_json_serialization", "1"}, {"tree_method", "gpu_hist"}}, fmap_, *pp_dmat_); } TEST_F(MultiClassesSerializationTest, GPU_CoordDescent) { TestLearnerSerialization({{"booster", "gblinear"}, {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, {"enable_experimental_json_serialization", "1"}, {"updater", "gpu_coord_descent"}}, fmap_, *pp_dmat_); } #endif // defined(XGBOOST_USE_CUDA) } // namespace xgboost