Normal prediction with DMatrix is now made thread-safe with locks, while the newly added inplace prediction is both thread-safe and lock-free. When the input data resides on a device (cupy, cudf), the returned predictions also reside on the device.

* Implementation for numpy, csr, cudf and cupy.
* Implementation for dask.
* Remove sync in simple DMatrix.
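A minimal sketch of the usage pattern the lock-free guarantee permits; the `InplacePredict` call and its "value" type string here are illustrative assumptions based on this change, not a stable API (see include/xgboost/learner.h for the actual interface):

#include <limits>
#include <thread>
#include <vector>

#include <dmlc/any.h>
#include <xgboost/learner.h>

// Several threads share one Learner. Because inplace prediction takes no
// lock, the calls below can proceed in parallel; each thread receives its
// own output buffer, so there is no race on the results.
void PredictConcurrently(xgboost::Learner* learner,
                         std::vector<dmlc::any> const& batches) {
  std::vector<std::thread> workers;
  for (auto const& batch : batches) {
    workers.emplace_back([learner, &batch] {
      xgboost::HostDeviceVector<xgboost::bst_float>* out_preds {nullptr};
      // Hypothetical call: predict raw values, NaN marks missing entries.
      learner->InplacePredict(batch, "value",
                              std::numeric_limits<float>::quiet_NaN(),
                              &out_preds);
    });
  }
  for (auto& w : workers) { w.join(); }
}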
/*!
 * Copyright 2016-2020 XGBoost contributors
 */
#include <dmlc/filesystem.h>
#include <xgboost/logging.h>
#include <xgboost/objective.h>
#include <xgboost/metric.h>
#include <xgboost/learner.h>
#include <xgboost/gbm.h>
#include <xgboost/json.h>
#include <gtest/gtest.h>

#include <algorithm>
#include <random>
#include <cinttypes>

#include "helpers.h"
#include "xgboost/c_api.h"
#include "../../src/data/adapter.h"
#include "../../src/gbm/gbtree_model.h"
#include "xgboost/predictor.h"

bool FileExists(const std::string& filename) {
  struct stat st;
  return stat(filename.c_str(), &st) == 0;
}

int64_t GetFileSize(const std::string& filename) {
  struct stat st;
  stat(filename.c_str(), &st);
  return st.st_size;
}

void CreateSimpleTestData(const std::string& filename) {
  CreateBigTestData(filename, 6);
}

void CreateBigTestData(const std::string& filename, size_t n_entries) {
  std::ofstream fo(filename.c_str());
  const size_t entries_per_row = 3;
  size_t n_rows = (n_entries + entries_per_row - 1) / entries_per_row;
  for (size_t i = 0; i < n_rows; ++i) {
    const char* row = i % 2 == 0 ? " 0:0 1:10 2:20\n" : " 0:0 3:30 4:40\n";
    fo << i << row;
  }
}
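
// For reference, CreateBigTestData(filename, 6) emits two LIBSVM-format rows,
// with the row index doubling as the label:
//   0 0:0 1:10 2:20
//   1 0:0 3:30 4:40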

void CheckObjFunctionImpl(std::unique_ptr<xgboost::ObjFunction> const& obj,
                          std::vector<xgboost::bst_float> preds,
                          std::vector<xgboost::bst_float> labels,
                          std::vector<xgboost::bst_float> weights,
                          xgboost::MetaInfo const& info,
                          std::vector<xgboost::bst_float> out_grad,
                          std::vector<xgboost::bst_float> out_hess) {
  xgboost::HostDeviceVector<xgboost::bst_float> in_preds(preds);
  xgboost::HostDeviceVector<xgboost::GradientPair> out_gpair;
  obj->GetGradient(in_preds, info, 1, &out_gpair);
  std::vector<xgboost::GradientPair>& gpair = out_gpair.HostVector();

  ASSERT_EQ(gpair.size(), in_preds.Size());
  for (int i = 0; i < static_cast<int>(gpair.size()); ++i) {
    EXPECT_NEAR(gpair[i].GetGrad(), out_grad[i], 0.01)
        << "Unexpected grad for pred=" << preds[i] << " label=" << labels[i]
        << " weight=" << weights[i];
    EXPECT_NEAR(gpair[i].GetHess(), out_hess[i], 0.01)
        << "Unexpected hess for pred=" << preds[i] << " label=" << labels[i]
        << " weight=" << weights[i];
  }
}

void CheckObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
                      std::vector<xgboost::bst_float> preds,
                      std::vector<xgboost::bst_float> labels,
                      std::vector<xgboost::bst_float> weights,
                      std::vector<xgboost::bst_float> out_grad,
                      std::vector<xgboost::bst_float> out_hess) {
  xgboost::MetaInfo info;
  info.num_row_ = labels.size();
  info.labels_.HostVector() = labels;
  info.weights_.HostVector() = weights;

  CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess);
}
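
// Example (illustrative values): for "reg:logistic" a raw margin of zero maps
// to probability 0.5, so grad = 0.5 - label and hess = 0.5 * (1 - 0.5) = 0.25.
// tparam is assumed to be a configured xgboost::GenericParameter.
//   std::unique_ptr<xgboost::ObjFunction> obj {
//       xgboost::ObjFunction::Create("reg:logistic", &tparam)};
//   CheckObjFunction(obj,
//                    {0.f, 0.f},      // preds (raw margins)
//                    {0.f, 1.f},      // labels
//                    {1.f, 1.f},      // weights
//                    {0.5f, -0.5f},   // expected gradients
//                    {0.25f, 0.25f}); // expected hessians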

xgboost::Json CheckConfigReloadImpl(xgboost::Configurable* const configurable,
                                    std::string name) {
  xgboost::Json config_0 { xgboost::Object() };
  configurable->SaveConfig(&config_0);
  configurable->LoadConfig(config_0);

  xgboost::Json config_1 { xgboost::Object() };
  configurable->SaveConfig(&config_1);

  std::string str_0, str_1;
  xgboost::Json::Dump(config_0, &str_0);
  xgboost::Json::Dump(config_1, &str_1);
  EXPECT_EQ(str_0, str_1);

  if (!name.empty()) {
    EXPECT_EQ(xgboost::get<xgboost::String>(config_1["name"]), name);
  }
  return config_1;
}

void CheckRankingObjFunction(std::unique_ptr<xgboost::ObjFunction> const& obj,
                             std::vector<xgboost::bst_float> preds,
                             std::vector<xgboost::bst_float> labels,
                             std::vector<xgboost::bst_float> weights,
                             std::vector<xgboost::bst_uint> groups,
                             std::vector<xgboost::bst_float> out_grad,
                             std::vector<xgboost::bst_float> out_hess) {
  xgboost::MetaInfo info;
  info.num_row_ = labels.size();
  info.labels_.HostVector() = labels;
  info.weights_.HostVector() = weights;
  info.group_ptr_ = groups;

  CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess);
}

xgboost::bst_float GetMetricEval(xgboost::Metric * metric,
                                 xgboost::HostDeviceVector<xgboost::bst_float> preds,
                                 std::vector<xgboost::bst_float> labels,
                                 std::vector<xgboost::bst_float> weights,
                                 std::vector<xgboost::bst_uint> groups) {
  xgboost::MetaInfo info;
  info.num_row_ = labels.size();
  info.labels_.HostVector() = labels;
  info.weights_.HostVector() = weights;
  info.group_ptr_ = groups;

  return metric->Eval(preds, info, false);
}
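
// Example (illustrative): RMSE over three rows; empty weights and groups mean
// unweighted rows and no query groups. Metric::Create is assumed to take the
// metric name and a configured GenericParameter, as in the metric tests.
//   std::unique_ptr<xgboost::Metric> metric {
//       xgboost::Metric::Create("rmse", &tparam)};
//   GetMetricEval(metric.get(), {0.1f, 0.9f, 0.1f}, {0.f, 1.f, 0.f}, {}, {});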

namespace xgboost {
bool IsNear(std::vector<xgboost::bst_float>::const_iterator _beg1,
            std::vector<xgboost::bst_float>::const_iterator _end1,
            std::vector<xgboost::bst_float>::const_iterator _beg2) {
  for (auto iter1 = _beg1, iter2 = _beg2; iter1 != _end1; ++iter1, ++iter2) {
    if (std::abs(*iter1 - *iter2) > xgboost::kRtEps) {
      return false;
    }
  }
  return true;
}

SimpleLCG::StateType SimpleLCG::operator()() {
  state_ = (alpha_ * state_) % mod_;
  return state_;
}
SimpleLCG::StateType SimpleLCG::Min() const {
  return seed_ * alpha_;
}
SimpleLCG::StateType SimpleLCG::Max() const {
  return max_value_;
}

void RandomDataGenerator::GenerateDense(HostDeviceVector<float> *out) const {
  SimpleLCG lcg{seed_};
  xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
  CHECK(out);

  out->Resize(rows_ * cols_, 0);
  auto &h_data = out->HostVector();
  // Rescale sparsity_ from [0, 1] into [lower_, upper_] so that it can be
  // compared directly against draws from dist.
  float sparsity = sparsity_ * (upper_ - lower_) + lower_;
  for (auto &v : h_data) {
    auto g = dist(&lcg);
    if (g < sparsity) {
      v = std::numeric_limits<float>::quiet_NaN();
    } else {
      v = dist(&lcg);
    }
  }
  if (device_ >= 0) {
    out->SetDevice(device_);
    // Accessing the device span triggers the copy onto the device.
    out->DeviceSpan();
  }
}

Json RandomDataGenerator::ArrayInterfaceImpl(HostDeviceVector<float> *storage,
                                             size_t rows, size_t cols) const {
  this->GenerateDense(storage);
  Json array_interface {Object()};
  array_interface["data"] = std::vector<Json>(2);
  array_interface["data"][0] =
      Integer(reinterpret_cast<int64_t>(storage->DevicePointer()));
  array_interface["data"][1] = Boolean(false);

  array_interface["shape"] = std::vector<Json>(2);
  array_interface["shape"][0] = rows;
  array_interface["shape"][1] = cols;

  array_interface["typestr"] = String("<f4");
  array_interface["version"] = 1;
  return array_interface;
}

std::string RandomDataGenerator::GenerateArrayInterface(
    HostDeviceVector<float> *storage) const {
  auto array_interface = this->ArrayInterfaceImpl(storage, rows_, cols_);
  std::string out;
  Json::Dump(array_interface, &out);
  return out;
}

std::string RandomDataGenerator::GenerateColumnarArrayInterface(
    std::vector<HostDeviceVector<float>> *data) const {
  CHECK(data);
  CHECK_EQ(data->size(), cols_);
  auto& storage = *data;
  Json arr { Array() };
  for (size_t i = 0; i < cols_; ++i) {
    auto column = this->ArrayInterfaceImpl(&storage[i], rows_, 1);
    get<Array>(arr).emplace_back(column);
  }
  std::string out;
  Json::Dump(arr, &out);
  return out;
}

void RandomDataGenerator::GenerateCSR(
    HostDeviceVector<float>* value, HostDeviceVector<bst_row_t>* row_ptr,
    HostDeviceVector<bst_feature_t>* columns) const {
  auto& h_value = value->HostVector();
  auto& h_rptr = row_ptr->HostVector();
  auto& h_cols = columns->HostVector();

  SimpleLCG lcg{seed_};
  xgboost::SimpleRealUniformDistribution<bst_float> dist(lower_, upper_);
  float sparsity = sparsity_ * (upper_ - lower_) + lower_;

  h_rptr.emplace_back(0);
  for (size_t i = 0; i < rows_; ++i) {
    size_t rptr = h_rptr.back();
    for (size_t j = 0; j < cols_; ++j) {
      auto g = dist(&lcg);
      if (g >= sparsity) {
        g = dist(&lcg);
        h_value.emplace_back(g);
        rptr++;
        h_cols.emplace_back(j);
      }
    }
    h_rptr.emplace_back(rptr);
  }

  if (device_ >= 0) {
    value->SetDevice(device_);
    value->DeviceSpan();
    row_ptr->SetDevice(device_);
    row_ptr->DeviceSpan();
    columns->SetDevice(device_);
    columns->DeviceSpan();
  }

  CHECK_LE(h_value.size(), rows_ * cols_);
  CHECK_EQ(value->Size(), h_rptr.back());
  CHECK_EQ(columns->Size(), value->Size());
}
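
// For reference, the CSR layout built above follows the usual convention: the
// dense matrix
//   [[1, 0, 2],
//    [0, 3, 0]]
// is stored as value = {1, 2, 3}, columns = {0, 2, 1}, row_ptr = {0, 2, 3}.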

std::shared_ptr<DMatrix>
RandomDataGenerator::GenerateDMatix(bool with_label, bool float_label,
                                    size_t classes) const {
  HostDeviceVector<float> data;
  HostDeviceVector<bst_row_t> rptrs;
  HostDeviceVector<bst_feature_t> columns;
  this->GenerateCSR(&data, &rptrs, &columns);
  data::CSRAdapter adapter(rptrs.HostPointer(), columns.HostPointer(),
                           data.HostPointer(), rows_, data.Size(), cols_);
  std::shared_ptr<DMatrix> out{
      DMatrix::Create(&adapter, std::numeric_limits<float>::quiet_NaN(), 1)};

  if (with_label) {
    RandomDataGenerator gen(rows_, 1, 0);
    if (!float_label) {
      gen.Lower(0).Upper(classes).GenerateDense(&out->Info().labels_);
      auto& h_labels = out->Info().labels_.HostVector();
      for (auto& v : h_labels) {
        v = static_cast<float>(static_cast<uint32_t>(v));
      }
    } else {
      gen.GenerateDense(&out->Info().labels_);
    }
  }
  return out;
}
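
// Typical usage, mirroring CreateTrainedGBM below: a dense 10x2 DMatrix with
// integer labels drawn from three classes:
//   auto p_dmat =
//       RandomDataGenerator(10, 2, 0).GenerateDMatix(true, false, 3);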

std::unique_ptr<DMatrix> CreateSparsePageDMatrix(
    size_t n_entries, size_t page_size, std::string tmp_file) {
  // Create sufficiently large data to make two row pages
  CreateBigTestData(tmp_file, n_entries);
  std::unique_ptr<DMatrix> dmat { DMatrix::Load(
      tmp_file + "#" + tmp_file + ".cache", true, false, "auto", page_size)};
  EXPECT_TRUE(FileExists(tmp_file + ".cache.row.page"));

  // Loop over the batches and count the records
  int64_t batch_count = 0;
  int64_t row_count = 0;
  for (const auto &batch : dmat->GetBatches<xgboost::SparsePage>()) {
    batch_count++;
    row_count += batch.Size();
  }
  EXPECT_GE(batch_count, 2);
  EXPECT_EQ(row_count, dmat->Info().num_row_);

  return dmat;
}

std::unique_ptr<DMatrix> CreateSparsePageDMatrixWithRC(
    size_t n_rows, size_t n_cols, size_t page_size, bool deterministic,
    const dmlc::TemporaryDirectory& tempdir) {
  if (!n_rows || !n_cols) {
    return nullptr;
  }

  // Create the svm file in a temp dir
  const std::string tmp_file = tempdir.path + "/big.libsvm";

  std::ofstream fo(tmp_file.c_str());
  size_t cols_per_row = ((std::max(n_rows, n_cols) - 1) / std::min(n_rows, n_cols)) + 1;
  int64_t rem_cols = n_cols;
  size_t col_idx = 0;

  // Random feature id generator
  std::random_device rdev;
  std::unique_ptr<std::mt19937> gen;
  if (deterministic) {
    // Seed it with a constant value for this configuration - without getting
    // too fancy like ordered pairing functions and the like to make it truly
    // unique
    gen.reset(new std::mt19937(n_rows * n_cols));
  } else {
    gen.reset(new std::mt19937(rdev()));
  }
  std::uniform_int_distribution<size_t> label(0, 1);
  std::uniform_int_distribution<size_t> dis(1, n_cols);

  for (size_t i = 0; i < n_rows; ++i) {
    // Make sure that all cols are slotted in the first few rows; randomly
    // distribute the rest
    std::stringstream row_data;
    size_t j = 0;
    if (rem_cols > 0) {
      for (; j < std::min(static_cast<size_t>(rem_cols), cols_per_row); ++j) {
        row_data << label(*gen) << " " << (col_idx + j) << ":"
                 << (col_idx + j + 1) * 10 * i;
      }
      rem_cols -= cols_per_row;
    } else {
      // Take some random number of columns in [1, n_cols] and slot them here
      std::vector<size_t> random_columns;
      size_t ncols = dis(*gen);
      for (; j < ncols; ++j) {
        size_t fid = (col_idx + j) % n_cols;
        random_columns.push_back(fid);
      }
      std::sort(random_columns.begin(), random_columns.end());
      for (auto fid : random_columns) {
        row_data << label(*gen) << " " << fid << ":" << (fid + 1) * 10 * i;
      }
    }
    col_idx += j;

    fo << row_data.str() << "\n";
  }
  fo.close();

  std::string uri = tmp_file;
  if (page_size > 0) {
    uri += "#" + tmp_file + ".cache";
  }
  std::unique_ptr<DMatrix> dmat(
      DMatrix::Load(uri, true, false, "auto", page_size));
  return dmat;
}

gbm::GBTreeModel CreateTestModel(LearnerModelParam const* param, size_t n_classes) {
  gbm::GBTreeModel model(param);

  for (size_t i = 0; i < n_classes; ++i) {
    std::vector<std::unique_ptr<RegTree>> trees;
    trees.push_back(std::unique_ptr<RegTree>(new RegTree));
    if (i == 0) {
      (*trees.back())[0].SetLeaf(1.5f);
      (*trees.back()).Stat(0).sum_hess = 1.0f;
    }
    model.CommitModel(std::move(trees), i);
  }

  return model;
}

std::unique_ptr<GradientBooster> CreateTrainedGBM(
    std::string name, Args kwargs, size_t kRows, size_t kCols,
    LearnerModelParam const* learner_model_param,
    GenericParameter const* generic_param) {
  auto caches = std::make_shared<PredictionContainer>();
  std::unique_ptr<GradientBooster> gbm {
      GradientBooster::Create(name, generic_param, learner_model_param)};
  gbm->Configure(kwargs);
  auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatix();

  std::vector<float> labels(kRows);
  for (size_t i = 0; i < kRows; ++i) {
    labels[i] = i;
  }
  p_dmat->Info().labels_.HostVector() = labels;
  HostDeviceVector<GradientPair> gpair;
  auto& h_gpair = gpair.HostVector();
  h_gpair.resize(kRows);
  for (size_t i = 0; i < kRows; ++i) {
    h_gpair[i] = {static_cast<float>(i), 1};
  }

  PredictionCacheEntry predts;

  gbm->DoBoost(p_dmat.get(), &gpair, &predts);

  return gbm;
}

}  // namespace xgboost