xgboost/tests/cpp/test_serialization.cc

// Copyright (c) 2019-2020 by Contributors
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <string>
#include <xgboost/learner.h>
#include <xgboost/data.h>
#include <xgboost/base.h>
#include "helpers.h"
#include "../../src/common/io.h"
#include "../../src/common/random.h"

namespace xgboost {

void CompareJSON(Json l, Json r) {
  switch (l.GetValue().Type()) {
  case Value::ValueKind::kString: {
    ASSERT_EQ(l, r);
    break;
  }
  case Value::ValueKind::kNumber: {
    ASSERT_NEAR(get<Number>(l), get<Number>(r), kRtEps);
    break;
  }
  case Value::ValueKind::kInteger: {
    ASSERT_EQ(l, r);
    break;
  }
  case Value::ValueKind::kObject: {
    auto const &l_obj = get<Object const>(l);
    auto const &r_obj = get<Object const>(r);
    ASSERT_EQ(l_obj.size(), r_obj.size());

    for (auto const& kv : l_obj) {
      ASSERT_NE(r_obj.find(kv.first), r_obj.cend());
      CompareJSON(l_obj.at(kv.first), r_obj.at(kv.first));
    }
    break;
  }
  case Value::ValueKind::kArray: {
    auto const& l_arr = get<Array const>(l);
    auto const& r_arr = get<Array const>(r);
    ASSERT_EQ(l_arr.size(), r_arr.size());
    for (size_t i = 0; i < l_arr.size(); ++i) {
      CompareJSON(l_arr[i], r_arr[i]);
    }
    break;
  }
  case Value::ValueKind::kBoolean: {
    ASSERT_EQ(l, r);
    break;
  }
  case Value::ValueKind::kNull: {
    ASSERT_EQ(l, r);
    break;
  }
  }
}

void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr<DMatrix> p_dmat) {
  for (auto& batch : p_dmat->GetBatches<SparsePage>()) {
    batch.data.HostVector();
    batch.offset.HostVector();
  }

  int32_t constexpr kIters = 2;

  dmlc::TemporaryDirectory tempdir;
  std::string const fname = tempdir.path + "/model";

  std::vector<std::string> dumped_0;
  std::string model_at_kiter;

  // Train for kIters.
  {
    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
    std::unique_ptr<Learner> learner {Learner::Create({p_dmat})};
    learner->SetParams(args);
    for (int32_t iter = 0; iter < kIters; ++iter) {
      learner->UpdateOneIter(iter, p_dmat);
    }
    dumped_0 = learner->DumpModel(fmap, true, "json");
    learner->Save(fo.get());

    common::MemoryBufferStream mem_out(&model_at_kiter);
    learner->Save(&mem_out);
  }

  // Assert dumped model is same after loading
  std::vector<std::string> dumped_1;
  {
    std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
    std::unique_ptr<Learner> learner {Learner::Create({p_dmat})};
    learner->Load(fi.get());
    learner->Configure();
    dumped_1 = learner->DumpModel(fmap, true, "json");
  }
  ASSERT_EQ(dumped_0, dumped_1);

  std::string model_at_2kiter;

  // Test training continuation with data from host
  {
    std::string continued_model;
    {
      // Continue the previous training with another kIters
      std::unique_ptr<dmlc::Stream> fi(
          dmlc::Stream::Create(fname.c_str(), "r"));
      std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
      learner->Load(fi.get());
      learner->Configure();

      // verify the loaded model doesn't change.
      std::string serialised_model_tmp;
      common::MemoryBufferStream mem_out(&serialised_model_tmp);
      learner->Save(&mem_out);
      ASSERT_EQ(model_at_kiter, serialised_model_tmp);

      for (auto &batch : p_dmat->GetBatches<SparsePage>()) {
        batch.data.HostVector();
        batch.offset.HostVector();
      }

      for (int32_t iter = kIters; iter < 2 * kIters; ++iter) {
        learner->UpdateOneIter(iter, p_dmat);
      }
      common::MemoryBufferStream fo(&continued_model);
      learner->Save(&fo);
    }

    {
      // Train 2 * kIters in one go
      std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
      learner->SetParams(args);
      for (int32_t iter = 0; iter < 2 * kIters; ++iter) {
        learner->UpdateOneIter(iter, p_dmat);

        // Verify model is same at the same iteration during two training
        // sessions.
        if (iter == kIters - 1) {
          std::string reproduced_model;
          common::MemoryBufferStream fo(&reproduced_model);
          learner->Save(&fo);
          ASSERT_EQ(model_at_kiter, reproduced_model);
        }
      }
      common::MemoryBufferStream fo(&model_at_2kiter);
      learner->Save(&fo);
    }

    Json m_0 = Json::Load(StringView{continued_model.c_str(), continued_model.size()});
    Json m_1 = Json::Load(StringView{model_at_2kiter.c_str(), model_at_2kiter.size()});
    CompareJSON(m_0, m_1);
  }

  // Test training continuation with data from device.
  {
    // Continue the previous training but on data from device.
    std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
    std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
    learner->Load(fi.get());
    learner->Configure();

    // verify the loaded model doesn't change.
    std::string serialised_model_tmp;
    common::MemoryBufferStream mem_out(&serialised_model_tmp);
    learner->Save(&mem_out);
    ASSERT_EQ(model_at_kiter, serialised_model_tmp);

    learner->SetParam("gpu_id", "0");
    // Pull data to device
    for (auto &batch : p_dmat->GetBatches<SparsePage>()) {
      batch.data.SetDevice(0);
      batch.data.DeviceSpan();
      batch.offset.SetDevice(0);
      batch.offset.DeviceSpan();
    }

    for (int32_t iter = kIters; iter < 2 * kIters; ++iter) {
      learner->UpdateOneIter(iter, p_dmat);
    }
    serialised_model_tmp = std::string{};
    common::MemoryBufferStream fo(&serialised_model_tmp);
    learner->Save(&fo);

    Json m_0 = Json::Load(StringView{model_at_2kiter.c_str(), model_at_2kiter.size()});
    Json m_1 = Json::Load(StringView{serialised_model_tmp.c_str(), serialised_model_tmp.size()});
    // GPU ID is changed as data is coming from device.
    ASSERT_EQ(get<Object>(m_0["Config"]["learner"]["generic_param"]).erase("gpu_id"),
              get<Object>(m_1["Config"]["learner"]["generic_param"]).erase("gpu_id"));
  }
}

// Binary is not tested, as it is NOT reproducible.
class SerializationTest : public ::testing::Test {
 protected:
  size_t constexpr static kRows = 15;
  size_t constexpr static kCols = 15;
  std::shared_ptr<DMatrix> p_dmat_;
  FeatureMap fmap_;

 protected:
  ~SerializationTest() override = default;
  void SetUp() override {
    p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix();

    p_dmat_->Info().labels_.Resize(kRows);
    auto &h_labels = p_dmat_->Info().labels_.HostVector();

    xgboost::SimpleLCG gen(0);
    SimpleRealUniformDistribution<float> dis(0.0f, 1.0f);

    for (auto& v : h_labels) { v = dis(&gen); }

    for (size_t i = 0; i < kCols; ++i) {
      std::string name = "feat_" + std::to_string(i);
      fmap_.PushBack(i, name.c_str(), "q");
    }
  }
};

TEST_F(SerializationTest, Exact) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"base_score", "3.14195265"},
                            {"max_depth", "2"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "gbtree"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"base_score", "3.14195265"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"base_score", "3.14195265"},
                            {"max_depth", "2"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
}

TEST_F(SerializationTest, Approx) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "gbtree"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
}

TEST_F(SerializationTest, Hist) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "gbtree"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
}

TEST_F(SerializationTest, CPUCoordDescent) {
  TestLearnerSerialization({{"booster", "gblinear"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"updater", "coord_descent"}},
                           fmap_, p_dmat_);
}

#if defined(XGBOOST_USE_CUDA)
TEST_F(SerializationTest, GpuHist) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "gbtree"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
}

TEST_F(SerializationTest, ConfigurationCount) {
  auto& p_dmat = p_dmat_;
  std::vector<std::shared_ptr<xgboost::DMatrix>> mat = {p_dmat};

  xgboost::ConsoleLogger::Configure({{"verbosity", "3"}});

  testing::internal::CaptureStderr();

  std::string model_str;
  {
    auto learner = std::unique_ptr<Learner>(Learner::Create(mat));

    learner->SetParam("tree_method", "gpu_hist");
    learner->SetParam("enable_experimental_json_serialization", "1");

    for (size_t i = 0; i < 10; ++i) {
      learner->UpdateOneIter(i, p_dmat);
    }
    common::MemoryBufferStream fo(&model_str);
    learner->Save(&fo);
  }

  {
    common::MemoryBufferStream fi(&model_str);
    auto learner = std::unique_ptr<Learner>(Learner::Create(mat));
    learner->Load(&fi);
    for (size_t i = 0; i < 10; ++i) {
      learner->UpdateOneIter(i, p_dmat);
    }
  }

  std::string output = testing::internal::GetCapturedStderr();
  std::string target = "[GPU Hist]: Configure";
  ASSERT_NE(output.find(target), std::string::npos);

  size_t occureences = 0;
  size_t pos = 0;
  // Should run configuration exactly 2 times, one for each learner.
  while ((pos = output.find("[GPU Hist]: Configure", pos)) != std::string::npos) {
    occureences ++;
    pos += target.size();
  }
  ASSERT_EQ(occureences, 2ul);

  xgboost::ConsoleLogger::Configure({{"verbosity", "2"}});
}

TEST_F(SerializationTest, GPUCoordDescent) {
  TestLearnerSerialization({{"booster", "gblinear"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"updater", "gpu_coord_descent"}},
                           fmap_, p_dmat_);
}
#endif  // defined(XGBOOST_USE_CUDA)


class LogitSerializationTest : public SerializationTest {
 protected:
  void SetUp() override {
    p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix();

    std::shared_ptr<DMatrix> p_dmat{p_dmat_};
    p_dmat->Info().labels_.Resize(kRows);
    auto &h_labels = p_dmat->Info().labels_.HostVector();

    std::bernoulli_distribution flip(0.5);
    auto& rnd = common::GlobalRandom();
    rnd.seed(0);

    for (auto& v : h_labels) { v = flip(rnd); }

    for (size_t i = 0; i < kCols; ++i) {
      std::string name = "feat_" + std::to_string(i);
      fmap_.PushBack(i, name.c_str(), "q");
    }
  }
};

TEST_F(LogitSerializationTest, Exact) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
}

TEST_F(LogitSerializationTest, Approx) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
}

TEST_F(LogitSerializationTest, Hist) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
}

TEST_F(LogitSerializationTest, CPUCoordDescent) {
  TestLearnerSerialization({{"booster", "gblinear"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"updater", "coord_descent"}},
                           fmap_, p_dmat_);
}

#if defined(XGBOOST_USE_CUDA)
TEST_F(LogitSerializationTest, GpuHist) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "gbtree"},
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
}

TEST_F(LogitSerializationTest, GPUCoordDescent) {
  TestLearnerSerialization({{"booster", "gblinear"},
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"updater", "gpu_coord_descent"}},
                           fmap_, p_dmat_);
}
#endif  // defined(XGBOOST_USE_CUDA)

class MultiClassesSerializationTest : public SerializationTest {
 protected:
  size_t constexpr static kClasses = 4;

  void SetUp() override {
    p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix();

    std::shared_ptr<DMatrix> p_dmat{p_dmat_};
    p_dmat->Info().labels_.Resize(kRows);
    auto &h_labels = p_dmat->Info().labels_.HostVector();

    std::uniform_int_distribution<size_t> categorical(0, kClasses - 1);
    auto& rnd = common::GlobalRandom();
    rnd.seed(0);

    for (auto& v : h_labels) { v = categorical(rnd); }

    for (size_t i = 0; i < kCols; ++i) {
      std::string name = "feat_" + std::to_string(i);
      fmap_.PushBack(i, name.c_str(), "q");
    }
  }
};

TEST_F(MultiClassesSerializationTest, Exact) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "gbtree"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"num_parallel_tree", "4"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
}

TEST_F(MultiClassesSerializationTest, Approx) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
}

TEST_F(MultiClassesSerializationTest, Hist) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "gbtree"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"num_parallel_tree", "4"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
}

TEST_F(MultiClassesSerializationTest, CPUCoordDescent) {
  TestLearnerSerialization({{"booster", "gblinear"},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"updater", "coord_descent"}},
                           fmap_, p_dmat_);
}

#if defined(XGBOOST_USE_CUDA)
TEST_F(MultiClassesSerializationTest, GpuHist) {
  TestLearnerSerialization({{"booster", "gbtree"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            // Somehow rebuilding the cache can generate slightly
                            // different result (1e-7) with CPU predictor for some
                            // entries.
                            {"predictor", "gpu_predictor"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "gbtree"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            // GPU_Hist has higher floating point error. 1e-6 doesn't work
                            // after num_parallel_tree goes to 4
                            {"num_parallel_tree", "3"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);

  TestLearnerSerialization({{"booster", "dart"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
}

TEST_F(MultiClassesSerializationTest, GPUCoordDescent) {
  TestLearnerSerialization({{"booster", "gblinear"},
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"updater", "gpu_coord_descent"}},
                           fmap_, p_dmat_);
}
#endif  // defined(XGBOOST_USE_CUDA)
}       // namespace xgboost