/*! * Copyright 2016-2020 XGBoost contributors */ #include #include #include #include #include #include #include #include #include #include #include #include "helpers.h" #include "xgboost/c_api.h" #include "../../src/data/adapter.h" #include "../../src/gbm/gbtree_model.h" #include "xgboost/predictor.h" bool FileExists(const std::string& filename) { struct stat st; return stat(filename.c_str(), &st) == 0; } int64_t GetFileSize(const std::string& filename) { struct stat st; stat(filename.c_str(), &st); return st.st_size; } void CreateSimpleTestData(const std::string& filename) { CreateBigTestData(filename, 6); } void CreateBigTestData(const std::string& filename, size_t n_entries) { std::ofstream fo(filename.c_str()); const size_t entries_per_row = 3; size_t n_rows = (n_entries + entries_per_row - 1) / entries_per_row; for (size_t i = 0; i < n_rows; ++i) { const char* row = i % 2 == 0 ? " 0:0 1:10 2:20\n" : " 0:0 3:30 4:40\n"; fo << i << row; } } void CheckObjFunctionImpl(std::unique_ptr const& obj, std::vector preds, std::vector labels, std::vector weights, xgboost::MetaInfo const& info, std::vector out_grad, std::vector out_hess) { xgboost::HostDeviceVector in_preds(preds); xgboost::HostDeviceVector out_gpair; obj->GetGradient(in_preds, info, 1, &out_gpair); std::vector& gpair = out_gpair.HostVector(); ASSERT_EQ(gpair.size(), in_preds.Size()); for (int i = 0; i < static_cast(gpair.size()); ++i) { EXPECT_NEAR(gpair[i].GetGrad(), out_grad[i], 0.01) << "Unexpected grad for pred=" << preds[i] << " label=" << labels[i] << " weight=" << weights[i]; EXPECT_NEAR(gpair[i].GetHess(), out_hess[i], 0.01) << "Unexpected hess for pred=" << preds[i] << " label=" << labels[i] << " weight=" << weights[i]; } } void CheckObjFunction(std::unique_ptr const& obj, std::vector preds, std::vector labels, std::vector weights, std::vector out_grad, std::vector out_hess) { xgboost::MetaInfo info; info.num_row_ = labels.size(); info.labels_.HostVector() = labels; info.weights_.HostVector() = weights; CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess); } xgboost::Json CheckConfigReloadImpl(xgboost::Configurable* const configurable, std::string name) { xgboost::Json config_0 { xgboost::Object() }; configurable->SaveConfig(&config_0); configurable->LoadConfig(config_0); xgboost::Json config_1 { xgboost::Object() }; configurable->SaveConfig(&config_1); std::string str_0, str_1; xgboost::Json::Dump(config_0, &str_0); xgboost::Json::Dump(config_1, &str_1); EXPECT_EQ(str_0, str_1); if (name != "") { EXPECT_EQ(xgboost::get(config_1["name"]), name); } return config_1; } void CheckRankingObjFunction(std::unique_ptr const& obj, std::vector preds, std::vector labels, std::vector weights, std::vector groups, std::vector out_grad, std::vector out_hess) { xgboost::MetaInfo info; info.num_row_ = labels.size(); info.labels_.HostVector() = labels; info.weights_.HostVector() = weights; info.group_ptr_ = groups; CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess); } xgboost::bst_float GetMetricEval(xgboost::Metric * metric, xgboost::HostDeviceVector preds, std::vector labels, std::vector weights, std::vector groups) { xgboost::MetaInfo info; info.num_row_ = labels.size(); info.labels_.HostVector() = labels; info.weights_.HostVector() = weights; info.group_ptr_ = groups; return metric->Eval(preds, info, false); } namespace xgboost { bool IsNear(std::vector::const_iterator _beg1, std::vector::const_iterator _end1, std::vector::const_iterator _beg2) { for (auto iter1 = _beg1, iter2 = _beg2; iter1 != _end1; ++iter1, ++iter2) { if (std::abs(*iter1 - *iter2) > xgboost::kRtEps){ return false; } } return true; } SimpleLCG::StateType SimpleLCG::operator()() { state_ = (alpha_ * state_) % mod_; return state_; } SimpleLCG::StateType SimpleLCG::Min() const { return seed_ * alpha_; } SimpleLCG::StateType SimpleLCG::Max() const { return max_value_; } void RandomDataGenerator::GenerateDense(HostDeviceVector *out) const { xgboost::SimpleRealUniformDistribution dist(lower_, upper_); CHECK(out); SimpleLCG lcg{lcg_}; out->Resize(rows_ * cols_, 0); auto &h_data = out->HostVector(); float sparsity = sparsity_ * (upper_ - lower_) + lower_; for (auto &v : h_data) { auto g = dist(&lcg); if (g < sparsity) { v = std::numeric_limits::quiet_NaN(); } else { v = dist(&lcg); } } if (device_ >= 0) { out->SetDevice(device_); out->DeviceSpan(); } } Json RandomDataGenerator::ArrayInterfaceImpl(HostDeviceVector *storage, size_t rows, size_t cols) const { this->GenerateDense(storage); Json array_interface {Object()}; array_interface["data"] = std::vector(2); if (storage->DeviceCanRead()) { array_interface["data"][0] = Integer(reinterpret_cast(storage->ConstDevicePointer())); } else { array_interface["data"][0] = Integer(reinterpret_cast(storage->ConstHostPointer())); } array_interface["data"][1] = Boolean(false); array_interface["shape"] = std::vector(2); array_interface["shape"][0] = rows; array_interface["shape"][1] = cols; array_interface["typestr"] = String(" *storage) const { auto array_interface = this->ArrayInterfaceImpl(storage, rows_, cols_); std::string out; Json::Dump(array_interface, &out); return out; } std::pair, std::string> RandomDataGenerator::GenerateArrayInterfaceBatch( HostDeviceVector *storage, size_t batches) const { this->GenerateDense(storage); std::vector result(batches); std::vector objects; size_t const rows_per_batch = rows_ / batches; auto make_interface = [storage, this](size_t offset, size_t rows) { Json array_interface{Object()}; array_interface["data"] = std::vector(2); if (device_ >= 0) { array_interface["data"][0] = Integer(reinterpret_cast(storage->DevicePointer() + offset)); } else { array_interface["data"][0] = Integer(reinterpret_cast(storage->HostPointer() + offset)); } array_interface["data"][1] = Boolean(false); array_interface["shape"] = std::vector(2); array_interface["shape"][0] = rows; array_interface["shape"][1] = cols_; array_interface["typestr"] = String("> *data) const { CHECK(data); CHECK_EQ(data->size(), cols_); auto& storage = *data; Json arr { Array() }; for (size_t i = 0; i < cols_; ++i) { auto column = this->ArrayInterfaceImpl(&storage[i], rows_, 1); get(arr).emplace_back(column); } std::string out; Json::Dump(arr, &out); return out; } void RandomDataGenerator::GenerateCSR( HostDeviceVector* value, HostDeviceVector* row_ptr, HostDeviceVector* columns) const { auto& h_value = value->HostVector(); auto& h_rptr = row_ptr->HostVector(); auto& h_cols = columns->HostVector(); SimpleLCG lcg{lcg_}; xgboost::SimpleRealUniformDistribution dist(lower_, upper_); float sparsity = sparsity_ * (upper_ - lower_) + lower_; h_rptr.emplace_back(0); for (size_t i = 0; i < rows_; ++i) { size_t rptr = h_rptr.back(); for (size_t j = 0; j < cols_; ++j) { auto g = dist(&lcg); if (g >= sparsity) { g = dist(&lcg); h_value.emplace_back(g); rptr++; h_cols.emplace_back(j); } } h_rptr.emplace_back(rptr); } if (device_ >= 0) { value->SetDevice(device_); value->DeviceSpan(); row_ptr->SetDevice(device_); row_ptr->DeviceSpan(); columns->SetDevice(device_); columns->DeviceSpan(); } CHECK_LE(h_value.size(), rows_ * cols_); CHECK_EQ(value->Size(), h_rptr.back()); CHECK_EQ(columns->Size(), value->Size()); } std::shared_ptr RandomDataGenerator::GenerateDMatrix(bool with_label, bool float_label, size_t classes) const { HostDeviceVector data; HostDeviceVector rptrs; HostDeviceVector columns; this->GenerateCSR(&data, &rptrs, &columns); data::CSRAdapter adapter(rptrs.HostPointer(), columns.HostPointer(), data.HostPointer(), rows_, data.Size(), cols_); std::shared_ptr out{ DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)}; if (with_label) { RandomDataGenerator gen(rows_, 1, 0); if (!float_label) { gen.Lower(0).Upper(classes).GenerateDense(&out->Info().labels_); auto& h_labels = out->Info().labels_.HostVector(); for (auto& v : h_labels) { v = static_cast(static_cast(v)); } } else { gen.GenerateDense(&out->Info().labels_); } } return out; } std::unique_ptr CreateSparsePageDMatrix( size_t n_entries, size_t page_size, std::string tmp_file) { // Create sufficiently large data to make two row pages CreateBigTestData(tmp_file, n_entries); std::unique_ptr dmat { DMatrix::Load( tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)}; EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page")); // Loop over the batches and count the records int64_t batch_count = 0; int64_t row_count = 0; for (const auto &batch : dmat->GetBatches()) { batch_count++; row_count += batch.Size(); } #if defined(_OPENMP) EXPECT_GE(batch_count, 2); EXPECT_EQ(row_count, dmat->Info().num_row_); #else #warning "External memory doesn't work with Non-OpenMP build " #endif // defined(_OPENMP) return dmat; } std::unique_ptr CreateSparsePageDMatrixWithRC( size_t n_rows, size_t n_cols, size_t page_size, bool deterministic, const dmlc::TemporaryDirectory& tempdir) { if (!n_rows || !n_cols) { return nullptr; } // Create the svm file in a temp dir const std::string tmp_file = tempdir.path + "/big.libsvm"; std::ofstream fo(tmp_file.c_str()); size_t cols_per_row = ((std::max(n_rows, n_cols) - 1) / std::min(n_rows, n_cols)) + 1; int64_t rem_cols = n_cols; size_t col_idx = 0; // Random feature id generator std::random_device rdev; std::unique_ptr gen; if (deterministic) { // Seed it with a constant value for this configuration - without getting too fancy // like ordered pairing functions and its likes to make it truely unique gen.reset(new std::mt19937(n_rows * n_cols)); } else { gen.reset(new std::mt19937(rdev())); } std::uniform_int_distribution label(0, 1); std::uniform_int_distribution dis(1, n_cols); for (size_t i = 0; i < n_rows; ++i) { // Make sure that all cols are slotted in the first few rows; randomly distribute the // rest std::stringstream row_data; size_t j = 0; if (rem_cols > 0) { for (; j < std::min(static_cast(rem_cols), cols_per_row); ++j) { row_data << label(*gen) << " " << (col_idx + j) << ":" << (col_idx + j + 1) * 10 * i; } rem_cols -= cols_per_row; } else { // Take some random number of colums in [1, n_cols] and slot them here std::vector random_columns; size_t ncols = dis(*gen); for (; j < ncols; ++j) { size_t fid = (col_idx + j) % n_cols; random_columns.push_back(fid); } std::sort(random_columns.begin(), random_columns.end()); for (auto fid : random_columns) { row_data << label(*gen) << " " << fid << ":" << (fid + 1) * 10 * i; } } col_idx += j; fo << row_data.str() << "\n"; } fo.close(); std::string uri = tmp_file; if (page_size > 0) { uri += "#" + tmp_file + ".cache"; } std::unique_ptr dmat( DMatrix::Load(uri, true, false, "auto", page_size)); return dmat; } gbm::GBTreeModel CreateTestModel(LearnerModelParam const* param, size_t n_classes) { gbm::GBTreeModel model(param); for (size_t i = 0; i < n_classes; ++i) { std::vector> trees; trees.push_back(std::unique_ptr(new RegTree)); if (i == 0) { (*trees.back())[0].SetLeaf(1.5f); (*trees.back()).Stat(0).sum_hess = 1.0f; } model.CommitModel(std::move(trees), i); } return model; } std::unique_ptr CreateTrainedGBM( std::string name, Args kwargs, size_t kRows, size_t kCols, LearnerModelParam const* learner_model_param, GenericParameter const* generic_param) { auto caches = std::make_shared< PredictionContainer >();; std::unique_ptr gbm { GradientBooster::Create(name, generic_param, learner_model_param)}; gbm->Configure(kwargs); auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); std::vector labels(kRows); for (size_t i = 0; i < kRows; ++i) { labels[i] = i; } p_dmat->Info().labels_.HostVector() = labels; HostDeviceVector gpair; auto& h_gpair = gpair.HostVector(); h_gpair.resize(kRows); for (size_t i = 0; i < kRows; ++i) { h_gpair[i] = {static_cast(i), 1}; } PredictionCacheEntry predts; gbm->DoBoost(p_dmat.get(), &gpair, &predts); return gbm; } } // namespace xgboost